Python 3で自然言語処理100本ノックの最初のほう その2
n-gramの書き方は、もっと格好良い方法がある気がする。メモリを無駄に食いすぎている感じがする。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Solutions to problems 05-09 of the "NLP 100 Exercise" warm-up chapter.

Covers n-gram extraction, set operations on bigrams, template-based sentence
generation, a simple substitution cipher, and a typoglycemia generator.
"""
# Wildcard import retained from the original file so no existing import is
# removed; nothing in the code below depends on it.
from functools import *
from itertools import islice
import random


def main():
    """Run every exercise once and print the results."""
    print(ngram("I am an NLPer"))
    print(ngram("I am an NLPer".split()))

    X = set(ngram("paraparaparadise"))
    Y = set(ngram("paragraph"))
    print(union_set(X, Y))
    print(intersection_set(X, Y))
    print(diff_set(X, Y))
    print(("s", "e") in X)
    print(("s", "e") in Y)

    print(template(12, "気温", 22.4))

    print(cipher("spam is 5000 yen."))
    print(cipher(cipher("spam is 5000 yen.")))

    print(typo("I couldn't believe that I could actually understand what I was reading : the phenomenal power of the human mind ."))


# 05 n-gram
def ngram(sequence, n=2, pad=True):
    """Return the n-grams of *sequence* as a list of tuples.

    Args:
        sequence: A ``str`` (character n-grams) or a ``list`` (item n-grams).
        n: Size of each n-gram; defaults to 2 (bigrams).
        pad: When true (the default), ``n - 1`` boundary markers are added to
            both ends before windowing, so the first and last items also
            appear next to a marker. The marker is ``" "`` for strings and
            ``""`` for lists.

    Returns:
        A list of ``n``-tuples. An empty list is returned when *sequence* is
        neither a ``str`` nor a ``list``.

    Raises:
        ValueError: If *n* is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    if isinstance(sequence, str):
        marker = " "
    elif isinstance(sequence, list):
        marker = [""]
    else:
        return []

    if pad:
        margin = marker * (n - 1)
        # Concatenation builds a new object, so the caller's list is untouched.
        sequence = margin + sequence + margin

    # Zip n lazily shifted views of the sequence instead of n shifted copies.
    shifted = (islice(sequence, offset, None) for offset in range(n))
    return list(zip(*shifted))


# 06 Sets
def union_set(set1, set2):
    """Return the union of *set1* and *set2*."""
    return set1.union(set2)


def intersection_set(set1, set2):
    """Return the elements common to *set1* and *set2*."""
    return set1.intersection(set2)


def diff_set(set1, set2):
    """Return the elements of *set1* that are not in *set2*."""
    return set1.difference(set2)


# 07 Sentence generation from a template
def template(x, y, z):
    """Return the sentence "<x>時の<y>は<z>" built from the three arguments."""
    return "{}時の{}は{}".format(x, y, z)


# 08 Cipher text
def cipher(string):
    """Encode *string* by mirroring every ASCII lowercase letter.

    Each letter ``a``-``z`` is replaced by ``chr(219 - code)`` (so ``a`` and
    ``z`` swap, ``b`` and ``y`` swap, ...); every other character is kept.
    Applying the function twice restores the original text.
    """
    return "".join(replace_alpha(char) for char in string)


def replace_alpha(char):
    """Return the mirrored letter for an ASCII lowercase *char*, else *char*."""
    code = ord(char)
    # The upper bound is "z" (122); using 97 + 26 would also match "{".
    if ord("a") <= code <= ord("z"):
        return chr(219 - code)
    return char


# 09 Typoglycemia
def typo(string):
    """Shuffle the inner letters of each whitespace-separated word.

    Words are split on whitespace, passed through :func:`inner_random`, and
    re-joined with single spaces (no leading or trailing space).
    """
    return " ".join(inner_random(word) for word in string.split())


def inner_random(string):
    """Return *string* with all but its first and last characters shuffled.

    Strings of four characters or fewer are returned unchanged, as required
    by the exercise statement.
    """
    if len(string) <= 4:
        return string
    middle = list(string[1:-1])
    random.shuffle(middle)
    return string[0] + "".join(middle) + string[-1]


if __name__ == '__main__':
    main()