python3で自然言語処理100本ノック (第4章: 形態素解析)
python3で自然言語処理100本ノックをやった@Pythonもくもく会*1。 今回は第4章の形態素解析。
基本的には、MeCabで形態素解析した結果をいじくりまわすかんじ。 python3用のmatplotlib入れるのめんどくさかったので、適当に表示させてたら、 最後の問題で両対数グラフ描けとか言われたので詰んだ。matplotlib入れたら問39やります(片手落ち)。。。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""NLP 100 knocks, chapter 4 (morphological analysis): problems 30-38.

Runs MeCab (ChaSen output mode) over neko.txt, converts each output line
into a morpheme dict, and derives: verbs, verb base forms, sahen nouns,
"A の B" phrases, runs of consecutive nouns, term frequencies, the top-10
words as a text bar chart, and a frequency-of-frequencies histogram.
"""


def main():
    # 30. Read the morphological-analysis result
    txt = inputtxt("./neko.txt")
    morph = analyze(txt)
    morph_dics = dicnize(morph)
    for x in morph_dics:
        print(x)
    # 31. Verbs
    vs = abstract_v(dicnize(morph))
    # 32. Verb base forms
    vbases = abstract_vbase(vs)
    for x in vbases:
        print(x)
    # 33. Sahen-connection nouns
    sahens = abstract_sahen(dicnize(morph))
    for x in sahens:
        print(x)
    # 34. "A の B" noun phrases
    nos = abstract_no(dicnize(morph))
    for x in nos:
        print(x)
    # 35. Runs of consecutive nouns
    n_seq = abstract_n_seqs(dicnize(morph))
    for x in n_seq:
        print(x)
    # 36. Word frequency, most frequent first
    tf = compute_tf(dicnize(morph))
    sorted_tf = reversed(sorted(tf.items(), key=lambda x: x[1]))
    for k, v in sorted_tf:
        print(k)
    # 37. Ten most frequent words as a star bar chart
    sorted_tf = reversed(sorted(tf.items(), key=lambda x: x[1]))
    for x in range(10):
        graph(next(sorted_tf))
    # 38. Histogram: for each frequency value, how many words have it.
    # Counting up from 1 and popping printed entries makes the loop end
    # exactly when the highest frequency has been printed.
    sorted_tf = reversed(sorted(tf.items(), key=lambda x: x[1]))
    tf_histgram = make_hist(sorted_tf)
    i = 0
    while tf_histgram:
        i = i + 1
        if i in tf_histgram:
            print(str(i) + "\t\t" + "*" * tf_histgram[i])
            tf_histgram.pop(i)
        else:
            print(str(i))


def graph(v):
    """Print one (word, count) pair as a star bar."""
    print(v[0] + "\t\t" + "*" * v[1])


def analyze(txt):
    """Run MeCab over *txt* and return the raw ChaSen-format string."""
    # Imported lazily so the pure helpers in this module remain importable
    # (and testable) on machines without the MeCab bindings installed.
    import MeCab
    mt = MeCab.Tagger("-Ochasen")
    return mt.parse(txt)


def dicnize(txt):
    """Turn a ChaSen-format string into an iterator of morpheme dicts.

    Lines too short to be a morpheme (e.g. "EOS", blanks) are dropped.
    """
    lines = txt.split("\n")
    return filter(None, (texts2dic(line.split()) for line in lines))


def texts2dic(txt_list):
    """Build a morpheme dict from one whitespace-split ChaSen line.

    Returns None for lines with fewer than four fields (EOS / blanks).
    Columns used: surface, reading, base form, hyphen-joined POS string
    whose first element becomes "pos" and optional second "pos1".
    """
    if len(txt_list) < 4:
        return None
    morph_dic = {
        "surface": txt_list[0],
        "pronunce": txt_list[1],
        "base": txt_list[2],
    }
    poses = txt_list[3].split("-")
    morph_dic["pos"] = poses[0]
    if len(poses) > 1:
        morph_dic["pos1"] = poses[1]
    return morph_dic


def abstract_v(morph_dics):
    """Yield the morphemes whose part of speech is a verb."""
    return filter(lambda x: x["pos"] == "動詞", morph_dics)


def abstract_vbase(vs):
    """Yield the base (dictionary) form of each morpheme."""
    return map(lambda x: x["base"], vs)


def abstract_sahen(morph_dics):
    """Yield nouns whose sub-POS is sahen-connection."""
    ns = filter(lambda x: x["pos"] == "名詞", morph_dics)
    ns_pos1 = filter(lambda x: "pos1" in x, ns)
    return filter(lambda x: x["pos1"] == "サ変接続", ns_pos1)


def abstract_no(morph_dics):
    """Return "A の B" phrases where both neighbours of "の" are nouns."""
    dics = list(morph_dics)
    nos = []
    for i, x in enumerate(dics):
        if x["surface"] == "の" and is_between_n(i, dics):
            nos.append(dics[i - 1]["surface"] + dics[i]["surface"]
                       + dics[i + 1]["surface"])
    return nos


def is_between_n(i, morph_dics):
    """Return True when index *i* sits strictly between two nouns.

    Bounds are checked explicitly: without them, i == 0 silently wrapped
    around to the last element via morph_dics[-1], and i at the last
    position raised IndexError on morph_dics[i + 1].
    """
    if i == 0 or i + 1 >= len(morph_dics):
        return False
    return (morph_dics[i - 1]["pos"] == "名詞"
            and morph_dics[i + 1]["pos"] == "名詞")


def abstract_n_seqs(morph_dics):
    """Yield maximal runs of two or more consecutive nouns, joined."""
    dics = list(morph_dics)
    seqs = []
    seq = []
    for d in dics:
        if d["pos"] == "名詞":
            seq.append(d["surface"])
        elif seq:
            seqs.append(seq)
            seq = []
    # Flush a run that reaches the end of the text; the original loop
    # dropped it because it only flushed upon seeing a non-noun.
    if seq:
        seqs.append(seq)
    words = filter(lambda x: len(x) > 1, seqs)
    return map(lambda x: "".join(x), words)


def compute_tf(morph_dics):
    """Count occurrences of each surface form; returns {surface: count}."""
    tf = {}
    for x in morph_dics:
        tf[x["surface"]] = tf.get(x["surface"], 0) + 1
    return tf


def make_hist(sorted_tf):
    """Map each frequency value to how many words have that frequency."""
    tf_histgram = {}
    for x in sorted_tf:
        tf_histgram[x[1]] = tf_histgram.get(x[1], 0) + 1
    return tf_histgram


def inputtxt(filepath):
    """Read and return the whole text file at *filepath*."""
    with open(filepath, 'r') as f:
        return f.read()


if __name__ == '__main__':
    main()