XPathベンチ
以下のようなベンチマークスクリプトを書いてみた。もっとスマートな方法でベンチマークできるといいな。
#-*- encoding: utf-8 -*-
# Benchmark: parse an RSS file and read the fields of every <item>, once with
# the standard-library ElementTree and once with lxml.  Every timed call parses
# the file from scratch, so parsing time is included in both measurements.
from xml.etree import ElementTree
from lxml import etree
from time import time


def xpath_by_xml(fname):
    """Parse *fname* with ElementTree and read four child texts of each <item>.

    The extracted values are deliberately discarded; only the lookup cost
    matters for the benchmark.
    """
    x = ElementTree.parse(fname)
    # './/item' rather than '//item': ElementTree.findall() handles a leading
    # '/' by prepending '.' itself (with a FutureWarning on ElementTree 1.3+),
    # so the explicit relative form selects the same elements without the
    # deprecated spelling.
    for item in x.findall('.//item'):
        title = item.findtext('./title')
        link = item.findtext('./link')
        desc = item.findtext('./description')
        date = item.findtext('./pubDate')


def xpath_by_lxml(fname):
    """Parse *fname* with lxml and read four child texts of each <item>.

    Raises IndexError if an <item> lacks one of the four child elements,
    because the first XPath match is taken with ``[0]``.
    """
    x = etree.parse(fname)
    for item in x.xpath('//item'):
        title = item.xpath('./title/text()')[0]
        link = item.xpath('./link/text()')[0]
        desc = item.xpath('./description/text()')[0]
        date = item.xpath('./pubDate/text()')[0]


def _elapsed(func, fname, repeat):
    """Return the wall-clock seconds taken by *repeat* calls of ``func(fname)``."""
    start = time()
    for _ in range(repeat):
        func(fname)
    return time() - start


count = 3000
fname = '/tmp/fn7.rss'

result_of_xml = _elapsed(xpath_by_xml, fname, count)
result_of_lxml = _elapsed(xpath_by_lxml, fname, count)

# Both timings are normalised to the lxml time, so the lxml side is always 1.0.
print("xml:lxml = %f:%f" % (result_of_xml / result_of_lxml,
                            result_of_lxml / result_of_lxml))
結果は以下のとおり。
xml:lxml = 2.214859:1.000000
[追記]timeitを使ってみた
http://d.hatena.ne.jp/stog/20090927/1254067688 を参考にしてtimeitというのを使ってみた
#-*- encoding: utf-8 -*-
# Benchmark using timeit: each timed call parses the RSS file and reads the
# fields of every <item>, once with ElementTree and once with lxml.
# Must be run as a script, because timeit imports the functions from __main__.
from xml.etree import ElementTree
from lxml import etree
import timeit

count = 300
fname = '/tmp/fn7.rss'

# Items found by the most recent benchmark call.  The timed functions publish
# their result here because timeit does not hand back return values.
finded = []


def xpath_by_xml(fname):
    """Parse *fname* with ElementTree and read four child texts of each <item>.

    Stores the list of matched <item> elements in the global ``finded``.
    """
    global finded
    etree_by_xml = ElementTree.parse(fname)
    # './/item' rather than '//item': ElementTree.findall() handles a leading
    # '/' by prepending '.' itself (with a FutureWarning on ElementTree 1.3+),
    # so the explicit relative form selects the same elements.
    finded = etree_by_xml.findall('.//item')
    for item in finded:
        title = item.findtext('./title')
        link = item.findtext('./link')
        desc = item.findtext('./description')
        date = item.findtext('./pubDate')


def xpath_by_lxml(fname):
    """Parse *fname* with lxml and read four child texts of each <item>.

    Stores the list of matched <item> elements in the global ``finded``.
    Raises IndexError if an <item> lacks one of the four child elements.
    """
    global finded
    etree_by_lxml = etree.parse(fname)
    finded = etree_by_lxml.xpath('//item')
    for item in finded:
        title = item.xpath('./title/text()')[0]
        link = item.xpath('./link/text()')[0]
        desc = item.xpath('./description/text()')[0]
        date = item.xpath('./pubDate/text()')[0]


for func in ['xpath_by_xml', 'xpath_by_lxml']:
    time_eachcall = timeit.Timer(
        setup=('from __main__ import %s' % func),
        # %r emits a correctly quoted and escaped string literal, so paths
        # containing quotes or backslashes still produce a valid statement.
        stmt=('%s(%r)' % (func, fname)),
    )
    print(func)
    print(" Time = %s " % time_eachcall.timeit(number=count))
    print(" Items = %d" % len(finded))
結果
xpath_by_xml
 Time = 1.40109205246
 Items = 10
xpath_by_lxml
 Time = 0.597398996353
 Items = 10
parse処理を分離したものだと
xpath_by_xml
 Time = 0.105925798416
 Items = 10
xpath_by_lxml
 Time = 0.461273908615
 Items = 10
結果が逆転するw
[さらに追記]lxmlのプリコンパイルを使ってみた
timeitを参考にさせてもらったところではRSSの記事数を増加させたものを読み込ませてベンチしていたので
こちらも真似てやってみた。
#-*- encoding: utf-8 -*-
# Benchmark using timeit with parsing done once up front: only the lookups are
# timed, for ElementTree, lxml with string XPath expressions, and lxml with
# precompiled XPath objects.
# Must be run as a script, because timeit imports the functions from __main__.
from xml.etree import ElementTree
from lxml import etree
import timeit

count = 100
fname = '/tmp/fn7_meny.rss'

# Parse once, outside the timed functions, so parsing cost is excluded.
etree_by_xml = ElementTree.parse(fname)
etree_by_lxml = etree.parse(fname)

# XPath expressions compiled once into callables via etree.XPath.
precompiled = {
    'items': etree.XPath('//item'),
    'title': etree.XPath('./title/text()'),
    'link': etree.XPath('./link/text()'),
    'desc': etree.XPath('./description/text()'),
    'date': etree.XPath('./pubDate/text()'),
}

# Items found by the most recent benchmark call.  The timed functions publish
# their result here because timeit does not hand back return values.
finded = []


def xpath_by_xml():
    """Read four child texts of each <item> from the ElementTree document.

    Stores the list of matched <item> elements in the global ``finded``.
    """
    global finded
    # './/item' rather than '//item': ElementTree.findall() handles a leading
    # '/' by prepending '.' itself (with a FutureWarning on ElementTree 1.3+),
    # so the explicit relative form selects the same elements.
    finded = etree_by_xml.findall('.//item')
    for item in finded:
        title = item.findtext('./title')
        link = item.findtext('./link')
        desc = item.findtext('./description')
        date = item.findtext('./pubDate')


def xpath_by_lxml():
    """Read four child texts of each <item> using lxml string XPath queries.

    Stores the list of matched <item> elements in the global ``finded``.
    Raises IndexError if an <item> lacks one of the four child elements.
    """
    global finded
    finded = etree_by_lxml.xpath('//item')
    for item in finded:
        title = item.xpath('./title/text()')[0]
        link = item.xpath('./link/text()')[0]
        desc = item.xpath('./description/text()')[0]
        date = item.xpath('./pubDate/text()')[0]


def xpath_precompiled_by_lxml():
    """Read four child texts of each <item> using the precompiled XPath objects.

    Stores the list of matched <item> elements in the global ``finded``.
    Raises IndexError if an <item> lacks one of the four child elements.
    """
    global finded
    finded = precompiled['items'](etree_by_lxml)
    for item in finded:
        # Take the first match with [0], exactly as xpath_by_lxml does, so
        # both lxml variants extract the same values.
        title = precompiled['title'](item)[0]
        link = precompiled['link'](item)[0]
        desc = precompiled['desc'](item)[0]
        date = precompiled['date'](item)[0]


for func in ['xpath_by_xml', 'xpath_by_lxml', 'xpath_precompiled_by_lxml']:
    time_eachcall = timeit.Timer(
        setup=('from __main__ import %s' % func),
        stmt=('%s()' % (func)),
    )
    print(func)
    print(" Time = %s " % time_eachcall.timeit(number=count))
    print(" Items = %d" % len(finded))
結果：プリコンパイルの効果はかなりあるみたいだけど、ElementTreeを使った方が速い。
とはいえlxmlの方が高機能だったりするので、lxmlを使うならプリコンパイルした方が良いということかな。
xpath_by_xml
 Time = 3.62736606598
 Items = 1000
xpath_by_lxml
 Time = 15.6545221806
 Items = 1000
xpath_precompiled_by_lxml
 Time = 4.25081205368
 Items = 1000