Auto Byte

Science AI

# 玩转词向量：用fastText预训练向量做个智能小程序

fastText 通过阅读维基百科学习到了什么？让我们打开这个 2GB 的文件一探究竟：

1. good -0.1242 -0.0674 -0.1430 -0.0005 -0.0345 ...

2. day 0.0320 0.0381 -0.0299 -0.0745 -0.0624 ...

3. three 0.0304 0.0070 -0.0708 0.0689 -0.0005 ...

4. know -0.0370 -0.0138 0.0392 -0.0395 -0.1591 ...

5. ...

import math  # needed by vector_len below; never imported in the original listing
from typing import List, Tuple  # Tuple is used by sorted_by_similarity / closest_analogies

# A word embedding: one float per dimension (300 dims in the fastText file).
Vector = List[float]


class Word:
    """A vocabulary entry: the word's text and its embedding vector."""

    def __init__(self, text: str, vector: Vector) -> None:
        self.text = text
        self.vector = vector

    def __repr__(self) -> str:
        # Omit the (300-element) vector so debug output stays readable.
        return f"Word({self.text!r})"

# Load the whole pre-trained vocabulary into memory (load_words is defined
# further down; the .vec file is the fastText text format shown above).
words = load_words('data/words.vec')

def vector_len(v: Vector) -> float:
    """Return the Euclidean (L2) norm of *v*."""
    total = 0.0
    for component in v:
        total += component * component
    return math.sqrt(total)

def dot_product(v1: Vector, v2: Vector) -> float:
    """Return the dot product of two equal-length vectors."""
    assert len(v1) == len(v2)
    return sum(a * b for a, b in zip(v1, v2))

def cosine_similarity(v1: Vector, v2: Vector) -> float:
    """
    Return the cosine of the angle between the two vectors.

    Values range from -1 (very different) to 1 (very similar).
    """
    numerator = dot_product(v1, v2)
    denominator = vector_len(v1) * vector_len(v2)
    return numerator / denominator

def sorted_by_similarity(words: List[Word], base_vector: Vector) -> List[Tuple[float, Word]]:
    """Return (similarity, word) pairs for every word, most similar first."""
    scored = [(cosine_similarity(base_vector, w.vector), w) for w in words]
    # Largest cosine similarity (closest to 1) first.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored

def print_related(words: List[Word], text: str) -> None:
    """Print the 7 words whose vectors are closest to that of *text*."""
    base_word = find_word(text, words)
    base_lower = base_word.text.lower()
    related = []
    for similarity, word in sorted_by_similarity(words, base_word.vector):
        # Skip the query word itself (case-insensitively).
        if word.text.lower() != base_lower:
            related.append(word.text)
    print(', '.join(related[:7]))

def find_word(text: str, words: List[Word]) -> Word:
    """Return the first Word whose text equals *text* exactly (case-sensitive).

    Bug fix: the original signature was (words, text), but every caller in
    this article — print_related and closest_analogies — passes (text, words).
    The parameter order now matches the call sites.

    Raises StopIteration if no word matches.
    """
    return next(w for w in words if text == w.text)

1. >>> print_related(words, 'spain')

2. britain, england, france, europe, germany, spanish, italy

3. >>> print_related(words, 'called')

4. termed, dubbed, named, referred, nicknamed, titled, described

5. >>> print_related(words, 'although')

6. though, however, but, whereas, while, since, Nevertheless

7. >>> print_related(words, 'arms')

8. legs, arm, weapons, coat, coats, armaments, hands

9. >>> print_related(words, 'roots')

10. root, origins, stems, beginnings, rooted, grass, traditions

1. vector("France") - vector("Paris") = answer_vector - vector("Rome")

1. vector("France") - vector("Paris") + vector("Rome") = answer_vector

def closest_analogies(
    left2: str, left1: str, right2: str, words: List[Word]
) -> List[Tuple[float, Word]]:
    """Solve the analogy left2 : left1 :: right2 : ?.

    Builds the target vector(left1) - vector(left2) + vector(right2) and
    returns its closest (similarity, word) candidates, most similar first,
    with trivial echoes of the input words filtered out.
    """
    word_left1 = find_word(left1, words)
    word_left2 = find_word(left2, words)
    word_right2 = find_word(right2, words)
    target = add_vectors(
        sub_vectors(word_left1.vector, word_left2.vector),
        word_right2.vector)
    candidates = sorted_by_similarity(words, target)[:10]

    def is_redundant(candidate: str) -> bool:
        """
        Sometimes the two left vectors are so close the answer is e.g.
        "shirt-clothing is like phone-phones". Skip 'phones' and get the next
        suggestion, which might be more interesting.
        """
        lowered = candidate.lower()
        return (left1.lower() in lowered
                or left2.lower() in lowered
                or right2.lower() in lowered)

    return [(similarity, word)
            for (similarity, word) in candidates
            if not is_redundant(word.text)]

def print_analogy(left2: str, left1: str, right2: str, words: List[Word]) -> None:
    """Print the best completion of left2-left1 :: right2-?, or '?' if none."""
    analogies = closest_analogies(left2, left1, right2, words)
    if not analogies:
        # Every candidate was filtered out as redundant.
        print(f"{left2}-{left1} is like {right2}-?")
        return
    dist, best = analogies[0]
    print(f"{left2}-{left1} is like {right2}-{best.text}")

1. >>> print_analogy('Paris', 'France', 'Rome', words)

2. Paris-France is like Rome-Italy

3. >>> print_analogy('man', 'king', 'woman', words)

4. man-king is like woman-queen

5. >>> print_analogy('walk', 'walked' , 'go', words)

6. walk-walked is like go-went

7. >>> print_analogy('quick', 'quickest' , 'far', words)

8. quick-quickest is like far-furthest

1. English-Jaguar is like German-BMW      // Expensive cars

2. English-Vauxhall is like German-Opel   // Cheaper cars

3. German-BMW is like American-Lexus      // Expensive cars

4. German-Opel is like American-Chrysler  // Cheaper cars

1. >>> print_analogy('dog', 'mammal', 'eagle', words)

2. dog-mammal is like eagle-bird

1. 寿司-米饭就像是披萨-___

2. 寿司-米饭就像是牛排-___

3. 衬衫-衣服就像是电话-___

4. 衬衫-衣服就像是碗-___

5. 书-阅读就像是电视-___

1. sushi-rice is like pizza-wheat      // Makes sense

2. sushi-rice is like steak-chicken

3. shirt-clothing is like bowl-food

4. shirt-clothing is like phone-mobile

5. book-reading is like TV-television

1. >>> print_analogy('do', 'done' , 'go', words)

2. do-done is like go-undertaken

1. sushi-rice is like steak-

2. [chicken (0.58), beef (0.56), potatoes (0.56), corn (0.55)]

3. book-reading is like TV-

4. [television (0.68), watching (0.64), listening (0.57), viewing (0.57)]

5. shirt-clothing is like bowl-[food, cereal, rice, porridge]

6. shirt-clothing is like phone-[mobile, cellular]

1. man-king is like woman-king

n-gram 和将词表示为向量等思想已经存在了很长时间，但直到 2013 年那篇 Word2vec 的论文和实现发表之后（https://arxiv.org/abs/1301.3781），才表明这些方法「能以远远更低的计算成本实现准确度的极大提升」。Tomas Mikolov 为 Word2Vec 和 fastText 这两个项目都立下过汗马功劳。

Python 3 runtime 接受类型注释，这一点非常棒。任何人都无需任何额外设置就能运行我的代码，也无需在运行前对代码做任何转换——这一点与 JavaScript 的 Flow 不同（Flow 需要先编译转换才能运行）。

def load_words(file_path: str) -> List[Word]:
    """Load the .vec file and clean it up: drop stop words, then duplicates."""
    raw = load_words_raw(file_path)
    print(f"Loaded {len(raw)} words.")
    filtered = remove_stop_words(raw)
    print(f"Removed stop words, {len(filtered)} remain.")
    deduplicated = remove_duplicates(filtered)
    print(f"Removed duplicates, {len(deduplicated)} remain.")
    return deduplicated

load_words_raw 只是逐行读取文件，然后将每一行解析成一个词。我们加载了 10 万词（每个词 300 维），而这个 Python 进程就用了近 1GB 的内存！这很糟糕，但可以忍受。

remove_stop_words 会将起始或结尾字符不是字母的词移除，比如「inter-」、「thanks.」、「--redrose64」。现在剩下 98,648 个词。

remove_duplicates 会忽略标点符号，所以「U.K」、「U.K.」和「UK」是一样的词，只存储一次。剩下 97,190 个词。但仍然还有一些重复，比如「years」和「Years」。我们也可以忽略大小写，但这样我们就无法区分「us」（我们）和「US」（美国）这样的词了。