一応動くサンプル。あまりテストしていないので、タイトルを取得できないパターンはあるかも。
覚えておきたいのはxpathの設定方法で、
どうやら
//head/title/text()
と書くと、タイトルをうまく取得できない場合がたまにあるようで、
そんな場合でも
//title/text()
のように、親要素の階層を指定せずに title タグを直接指定したら取得できた。
import re
import urllib2

from lxml import etree

# Encodings tried, in this order, when guessing how a byte string is encoded.
_CANDIDATE_ENCODINGS = ("utf-8", "shift_jis", "euc-jp")

# Matches the CR / LF characters that are stripped out of page titles.
_LINE_BREAKS = re.compile("\r|\n")


def get_title_from_url(url, lxml_tree=None):
    """Return the text of the first <title> element of a page.

    Args:
        url: Address of the page. Only fetched when ``lxml_tree`` is None.
        lxml_tree: Optional already-parsed tree to query instead of
            fetching ``url``.

    Returns:
        None when no parse tree could be obtained; the string
        "Page Title not found" when the tree yields no title text;
        otherwise the title with CR/LF characters removed. When an
        encoding can be guessed for the title, it is decoded with that
        encoding and re-encoded as UTF-8.
    """
    if lxml_tree is None:
        lxml_tree = get_parsetree_from_url(url)
    if lxml_tree is None:
        return None

    # "//title" rather than "//head/title": the title is matched wherever
    # it ends up in the parsed tree, not only directly under <head>.
    title_block = lxml_tree.xpath("//title/text()")
    if not title_block:
        return "Page Title not found"

    title = _LINE_BREAKS.sub("", title_block[0])
    # guess_content_encoding() returns None for values it cannot decode
    # (e.g. text that is already unicode); those are returned untouched.
    charset = guess_content_encoding(title)
    if charset is not None:
        title = title.decode(charset).encode("utf-8")
    return title


def get_parsetree_from_url(url):
    """Fetch ``url`` and parse the response body as HTML with lxml.

    Args:
        url: Address to fetch, or None.

    Returns:
        The result of ``etree.fromstring`` on the response body, or None
        when ``url`` is None or the request fails with an HTTP error.
        Other errors raised by ``urllib2.urlopen`` are not caught here.
    """
    if url is None:
        return None
    try:
        response = urllib2.urlopen(url)
    except urllib2.HTTPError:
        return None

    # Always release the connection, even if reading the body fails.
    try:
        html_data = response.read()
        charset = response.headers.getparam('charset')
    finally:
        response.close()

    if charset is None:
        # No charset in the Content-Type header: fall back to guessing.
        # NOTE(review): an explicit encoding presumably takes precedence
        # over any charset declared inside the document - confirm against
        # the lxml HTMLParser documentation.
        charset = guess_content_encoding(html_data)
    return etree.fromstring(html_data,
                            parser=etree.HTMLParser(encoding=charset))


def guess_content_encoding(html_content):
    """Return the first candidate encoding that decodes ``html_content``.

    The candidates are utf-8, shift_jis and euc-jp, tried in that order.

    Args:
        html_content: Byte string to probe.

    Returns:
        The name of the first encoding that decodes the content without
        error, or None when none of them does (this includes input that
        ``unicode()`` refuses to decode, such as None or unicode objects).
    """
    for candidate in _CANDIDATE_ENCODINGS:
        try:
            unicode(html_content, candidate)
        except (UnicodeError, TypeError):
            # UnicodeError: the bytes are not valid in this encoding.
            # TypeError: the value is not a decodable byte string.
            continue
        return candidate
    return None