soy-curd's blog

へぼプログラマーです [https://twitter.com/soycurd1]

python3で自然言語処理100本ノックの最初のほう その2

n-gramの書き方、もっと格好良い方法ある気がする。メモリバカ食いすぎる感。

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from functools import *
import random

def main():
    """Print the result of each exercise (05-09) for its sample input."""
    # 05: character bi-grams first, then word bi-grams of the same sentence.
    sentence = "I am an NLPer"
    print(ngram(sentence))
    print(ngram(sentence.split()))

    # 06: set operations over the character bi-grams of two words.
    bigrams_x = set(ngram("paraparaparadise"))
    bigrams_y = set(ngram("paragraph"))
    print(union_set(bigrams_x, bigrams_y))
    print(intersection_set(bigrams_x, bigrams_y))
    print(diff_set(bigrams_x, bigrams_y))
    target = ("s", "e")
    print(target in bigrams_x)
    print(target in bigrams_y)

    # 07: sentence generation from a template.
    print(template(12, "気温", 22.4))

    # 08: encode once, then encode the encoded text again.
    message = "spam is 5000 yen."
    print(cipher(message))
    print(cipher(cipher(message)))

    # 09: shuffle the inner letters of each word.
    print(typo("I couldn't believe that I could actually understand what I was reading : the phenomenal power of the human mind ."))


# 05 n-gram
def ngram(sequence, n=2):
    """Build padded n-grams from a string or a list.

    The input is padded on both ends with ``n - 1`` filler items (a space
    for a string, an empty string for a list), and every window of ``n``
    consecutive items is returned as a tuple. With the default ``n=2`` the
    result is the list of padded bi-grams.

    Args:
        sequence: A ``str`` (character n-grams) or a ``list`` (item n-grams).
        n: Window size; must be at least 1. Defaults to 2.

    Returns:
        A list of ``n``-tuples, or an empty list when ``sequence`` is
        neither a ``str`` nor a ``list``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # The filler must have the same type as the input so it can be
    # concatenated to it.
    if isinstance(sequence, str):
        pad = " " * (n - 1)
    elif isinstance(sequence, list):
        pad = [""] * (n - 1)
    else:
        return []

    padded = pad + sequence + pad
    return [tuple(padded[i:i + n]) for i in range(len(padded) - n + 1)]


# 06 集合
def union_set(set1, set2):
    """Return the elements that appear in set1, in set2, or in both."""
    combined = set1.union(set2)
    return combined


def intersection_set(set1, set2):
    """Return the elements that appear in both set1 and set2."""
    shared = set1.intersection(set2)
    return shared


def diff_set(set1, set2):
    """Return the elements of set1 that do not appear in set2."""
    remaining = set1.difference(set2)
    return remaining


# 07 テンプレートによる文生成
def template(x, y, z):
    """Fill x, y and z into the sentence template "<x>時の<y>は<z>".

    x and z are converted with str(); y is used as-is and must already
    be a string.
    """
    pieces = [str(x), "時の", y, "は", str(z)]
    return "".join(pieces)


# 08 暗号文
def cipher(string):
    """Return string with every character passed through replace_alpha.

    The converted characters are concatenated in their original order.
    """
    converted = (replace_alpha(ch) for ch in string)
    return "".join(converted)


def replace_alpha(char):
    """Mirror a lowercase ASCII letter within the alphabet.

    A lowercase letter is replaced by the character whose code point is
    ``219 - ord(char)`` (so "a" <-> "z", "b" <-> "y", ...). Any other
    character is returned unchanged.

    Args:
        char: A single-character string.

    Returns:
        The mirrored letter for "a".."z", otherwise ``char`` itself.
    """
    code = ord(char)
    # The range is inclusive of "z" (122) only; the former upper bound of
    # 97 + 26 = 123 also matched "{" and turned it into a backtick.
    if ord("a") <= code <= ord("z"):
        # ord("a") + ord("z") == 219, the constant from the exercise.
        return chr(ord("a") + ord("z") - code)
    return char


# 09 Typoglycemia
def typo(string):
    """Shuffle the inner letters of every word in a sentence.

    The input is split on whitespace, each word is passed through
    inner_random, and the results are joined with single spaces.

    Args:
        string: The sentence to process.

    Returns:
        The processed sentence. There is no leading or trailing space;
        an input without any word yields an empty string.
    """
    words = string.split()
    # Join with a separator rather than prefixing every word with a space,
    # which used to leave a stray space at the start of the result.
    return " ".join(inner_random(word) for word in words)


def inner_random(string):
    """Randomly reorder the letters between the first and last character.

    Args:
        string: A single word.

    Returns:
        The word with its first and last characters kept in place and the
        characters in between shuffled. Words of four characters or fewer
        are returned unchanged.
    """
    # The exercise asks that words of length four or less stay untouched,
    # so only words of five or more characters are shuffled.
    if len(string) <= 4:
        return string

    middle = list(string[1:-1])
    random.shuffle(middle)
    return string[0] + "".join(middle) + string[-1]


# Run the demo only when this file is executed as a script.
if __name__=='__main__':
    main()