Yahoo
Yahoo!デベロッパーネットワークの日本語形態素解析APIを使って、形態素解析を行うPythonのクラスを作ってみた。
関数の引数にはデフォルト値を使って、URLやnamespaceはクラス変数に持たせるようにしてみた。
# coding: utf-8
# Small client for the Yahoo! JAPAN morphological analysis web service.
from contextlib import closing
from xml.etree import ElementTree as etree

try:  # Python 2
    from urllib2 import urlopen
    from urllib import urlencode
except ImportError:  # Python 3
    from urllib.request import urlopen
    from urllib.parse import urlencode


class yahooMA(object):
    """Client that sends a sentence to the analysis service and parses the reply.

    The endpoint URL, the XML namespace and the list of extracted fields are
    class attributes, so a subclass or an instance can override them.
    """

    # Endpoint the request is sent to.
    url = 'http://jlp.yahooapis.jp/MAService/V1/parse'
    # XML namespace used to qualify element names in the response.
    ns = 'urn:yahoo:jp:jlp'
    # Child elements copied out of every <word> element.
    fields = ('surface', 'reading', 'pos', 'baseform')

    def __init__(self, apikey=''):
        """Remember the application ID that is sent as ``appid``."""
        self.apikey = apikey

    def parse(self, sentence='', results='ma,uniq', filter='',
              uniq_response='', uniq_filter='', uniq_by_baseform=''):
        """Analyse *sentence* and return the words found in the response.

        Every keyword argument is forwarded unchanged as the request
        parameter of the same name (``filter`` shadows the builtin only
        because it has to match the request parameter name).

        Returns:
            A list of dicts, one per ``<word>`` element, keyed by the names
            in ``fields``. A field whose element is missing is ``None``.
            Words from every ``word_list`` in the response are concatenated
            into this single flat list.

        Raises:
            Whatever ``urlopen`` raises for network/HTTP failures and
            whatever ElementTree raises for a malformed XML body.
        """
        query = {
            'appid': self.apikey,
            'sentence': sentence,
            'results': results,
            'filter': filter,
            'uniq_response': uniq_response,
            'uniq_filter': uniq_filter,
            'uniq_by_baseform': uniq_by_baseform,
        }
        # Python 2's urlencode() calls str() on each value, which raises
        # UnicodeEncodeError for non-ASCII unicode text; encode up front.
        encoded = urlencode(
            dict((key, self._to_utf8(value)) for key, value in query.items())
        )
        # closing() guarantees the HTTP response is released even if read()
        # fails; Python 2 responses are not context managers themselves.
        with closing(urlopen(self.url + '?' + encoded)) as response:
            body = response.read()
        return self._words_from_xml(body)

    def _words_from_xml(self, xml):
        """Turn a response document into a list of per-word dicts."""
        tree = etree.fromstring(xml)
        word_path = './/{%s}word_list/{%s}word' % (self.ns, self.ns)
        return [
            dict(
                (field, word.findtext('./{%s}%s' % (self.ns, field)))
                for field in self.fields
            )
            for word in tree.findall(word_path)
        ]

    @staticmethod
    def _to_utf8(value):
        """Return *value* in a form urlencode() accepts on Python 2 and 3."""
        if isinstance(value, bytes):
            return value
        if isinstance(value, type(u'')):
            return value.encode('utf-8')
        # Numbers and other objects: same str() conversion urlencode applies.
        return str(value)
使い方は以下の通り
# Create a client with your application ID, then analyse one sentence.
# In an interactive session the returned word list is echoed to the console.
analyzer = yahooMA(apikey='[% YahooデベロッパーネットワークのアプリケーションID %]')
analyzer.parse('庭には2羽ニワトリがいるのだーー')
実行結果
[{'reading': u'\u306b\u308f', 'pos': u'\u540d\u8a5e', 'surface': u'\u5ead', 'baseform': None}, {'reading': u'\u306b', 'pos': u'\u52a9\u8a5e', 'surface': u'\u306b', 'baseform': None}, {'reading': u'\u306f', 'pos': u'\u52a9\u8a5e', 'surface': u'\u306f', 'baseform': None}, {'reading': '2', 'pos': u'\u540d\u8a5e', 'surface': '2', 'baseform': None}, {'reading': u'\u308f', 'pos': u'\u63a5\u5c3e\u8f9e', 'surface': u'\u7fbd', 'baseform': None}, {'reading': u'\u306b\u308f\u3068\u308a', 'pos': u'\u540d\u8a5e', 'surface': u'\u30cb\u30ef\u30c8\u30ea', 'baseform': None}, {'reading': u'\u304c', 'pos': u'\u52a9\u8a5e', 'surface': u'\u304c', 'baseform': None}, {'reading': u'\u3044\u308b', 'pos': u'\u52d5\u8a5e', 'surface': u'\u3044\u308b', 'baseform': None}, {'reading': u'\u306e', 'pos': u'\u52a9\u8a5e', 'surface': u'\u306e', 'baseform': None}, {'reading': u'\u3060', 'pos': u'\u52a9\u52d5\u8a5e', 'surface': u'\u3060', 'baseform': None}, {'reading': u'\u30fc', 'pos': u'\u7279\u6b8a', 'surface': u'\u30fc', 'baseform': None}, {'reading': u'\u30fc', 'pos': u'\u7279\u6b8a', 'surface': u'\u30fc', 'baseform': None}, {'reading': '', 'pos': u'\u7279\u6b8a', 'surface': u'\u30fc', 'baseform': None}, {'reading': '', 'pos': u'\u540d\u8a5e', 'surface': '2', 'baseform': None}, {'reading': '', 'pos': u'\u52d5\u8a5e', 'surface': u'\u3044\u308b', 'baseform': None}, {'reading': '', 'pos': u'\u52a9\u8a5e', 'surface': u'\u304c', 'baseform': None}, {'reading': '', 'pos': u'\u52a9\u52d5\u8a5e', 'surface': u'\u3060', 'baseform': None}, {'reading': '', 'pos': u'\u52a9\u8a5e', 'surface': u'\u306b', 'baseform': None}, {'reading': '', 'pos': u'\u52a9\u8a5e', 'surface': u'\u306e', 'baseform': None}, {'reading': '', 'pos': u'\u52a9\u8a5e', 'surface': u'\u306f', 'baseform': None}, {'reading': '', 'pos': u'\u540d\u8a5e', 'surface': u'\u30cb\u30ef\u30c8\u30ea', 'baseform': None}, {'reading': '', 'pos': u'\u540d\u8a5e', 'surface': u'\u5ead', 'baseform': None}, {'reading': '', 'pos': u'\u63a5\u5c3e\u8f9e', 
'surface': u'\u7fbd', 'baseform': None}]
奇麗に整形して表示できないかなぁ。