Yahoo

Yahoo!デベロッパーネットワークの日本語形態素解析を使って形態素解析をするクラスを作ってみた。
関数の引数にはデフォルト値を使って、URLやnamespaceはクラス変数に持たせるようにしてみた。

# coding: utf-8

from urllib2 import urlopen
from urllib import urlencode
from xml.etree import ElementTree as etree

class yahooMA(object):
  """Small client for the Yahoo! JAPAN morphological analysis web API.

  The endpoint URL and the XML namespace are class attributes; they are
  read through the instance, so a subclass or a single instance may
  override them without touching ``parse``.
  """

  # Endpoint that parse() sends its GET request to.
  url = 'http://jlp.yahooapis.jp/MAService/V1/parse'
  # XML namespace qualifying the response elements that parse() looks up.
  ns = 'urn:yahoo:jp:jlp'
  # Child elements of each <word> that are copied into the result dicts.
  _fields = ('surface', 'reading', 'pos', 'baseform')

  def __init__(self, apikey=''):
    """Remember the application ID sent as ``appid`` with every request."""
    self.apikey = apikey

  def parse(self, sentence='', results='ma,uniq', filter='', uniq_response='', uniq_filter='', uniq_by_baseform=''):
    """Send ``sentence`` to the service and return the words it reports.

    Every argument is forwarded as the query parameter of the same name
    (empty strings included); see the service's API reference for the
    values each one accepts.  ``filter`` shadows the builtin, but the
    name is part of the public signature and is therefore kept.

    Returns a list with one dict per ``word`` element found under any
    ``word_list`` element of the response, in document order.  When the
    response contains several word lists (as with the default
    ``results='ma,uniq'``) their words end up in the same flat list.
    Each dict has the keys ``surface``, ``reading``, ``pos`` and
    ``baseform``; a value is ``None`` when the element is missing.

    Errors raised by ``urlopen`` or by the XML parser are not caught.
    """
    query = {
      'appid': self.apikey,
      'sentence': sentence,
      'results': results,
      'filter': filter,
      'uniq_response': uniq_response,
      'uniq_filter': uniq_filter,
      'uniq_by_baseform': uniq_by_baseform,
    }
    # Python 2's urlencode() calls str() on every value, which raises
    # UnicodeEncodeError for unicode text such as Japanese sentences.
    # Encoding text values to UTF-8 bytes beforehand avoids that.
    text_type = type(u'')
    for key in list(query):
      if isinstance(query[key], text_type):
        query[key] = query[key].encode('utf-8')

    res = urlopen(self.url + '?' + urlencode(query))
    try:
      body = res.read()
    finally:
      # Release the connection even when reading fails.
      res.close()

    tree = etree.fromstring(body)
    ns = self.ns
    wordlist = []
    for w in tree.findall('.//{%s}word_list/{%s}word' % (ns, ns)):
      word = {}
      for field in self._fields:
        word[field] = w.findtext('./{%s}%s' % (ns, field))
      wordlist.append(word)
    return wordlist

使い方は以下の通り

# Create a client; replace the placeholder with your own application ID.
ma = yahooMA(apikey='[% YahooデベロッパーネットワークのアプリケーションID %]')
# Analyze one sentence. parse() returns a list of per-word dicts; typed at the
# interactive prompt, the bare expression echoes that list (shown below).
ma.parse('庭には2羽ニワトリがいるのだーー')

実行結果

[{'reading': u'\u306b\u308f', 'pos': u'\u540d\u8a5e', 'surface': u'\u5ead', 'baseform': None}, {'reading': u'\u306b', 'pos': u'\u52a9\u8a5e', 'surface': u'\u306b', 'baseform': None}, {'reading': u'\u306f', 'pos': u'\u52a9\u8a5e', 'surface': u'\u306f', 'baseform': None}, {'reading': '2', 'pos': u'\u540d\u8a5e', 'surface': '2', 'baseform': None}, {'reading': u'\u308f', 'pos': u'\u63a5\u5c3e\u8f9e', 'surface': u'\u7fbd', 'baseform': None}, {'reading': u'\u306b\u308f\u3068\u308a', 'pos': u'\u540d\u8a5e', 'surface': u'\u30cb\u30ef\u30c8\u30ea', 'baseform': None}, {'reading': u'\u304c', 'pos': u'\u52a9\u8a5e', 'surface': u'\u304c', 'baseform': None}, {'reading': u'\u3044\u308b', 'pos': u'\u52d5\u8a5e', 'surface': u'\u3044\u308b', 'baseform': None}, {'reading': u'\u306e', 'pos': u'\u52a9\u8a5e', 'surface': u'\u306e', 'baseform': None}, {'reading': u'\u3060', 'pos': u'\u52a9\u52d5\u8a5e', 'surface': u'\u3060', 'baseform': None}, {'reading': u'\u30fc', 'pos': u'\u7279\u6b8a', 'surface': u'\u30fc', 'baseform': None}, {'reading': u'\u30fc', 'pos': u'\u7279\u6b8a', 'surface': u'\u30fc', 'baseform': None}, {'reading': '', 'pos': u'\u7279\u6b8a', 'surface': u'\u30fc', 'baseform': None}, {'reading': '', 'pos': u'\u540d\u8a5e', 'surface': '2', 'baseform': None}, {'reading': '', 'pos': u'\u52d5\u8a5e', 'surface': u'\u3044\u308b', 'baseform': None}, {'reading': '', 'pos': u'\u52a9\u8a5e', 'surface': u'\u304c', 'baseform': None}, {'reading': '', 'pos': u'\u52a9\u52d5\u8a5e', 'surface': u'\u3060', 'baseform': None}, {'reading': '', 'pos': u'\u52a9\u8a5e', 'surface': u'\u306b', 'baseform': None}, {'reading': '', 'pos': u'\u52a9\u8a5e', 'surface': u'\u306e', 'baseform': None}, {'reading': '', 'pos': u'\u52a9\u8a5e', 'surface': u'\u306f', 'baseform': None}, {'reading': '', 'pos': u'\u540d\u8a5e', 'surface': u'\u30cb\u30ef\u30c8\u30ea', 'baseform': None}, {'reading': '', 'pos': u'\u540d\u8a5e', 'surface': u'\u5ead', 'baseform': None}, {'reading': '', 'pos': u'\u63a5\u5c3e\u8f9e', 
'surface': u'\u7fbd', 'baseform': None}]

奇麗に整形して表示できないかなぁ。