This class implements the incremental unigram table update algorithm
proposed by Kaji and Kobayashi.
-
While the table is incomplete, it is updated as in the original unigram table
algorithm.
-
If the table is complete, a random number n is selected, and n copies of the
word w are added to the table array.
References
1. Nobuhiro Kaji and Hayato Kobayashi. 2017. Incremental Skip-gram Model
with Negative Sampling. In Proceedings of the 2017 Conference on
Empirical Methods in Natural Language Processing, pages 363–371,
Copenhagen, Denmark. Association for Computational Linguistics.
Source code in rivertext/models/iword2vec/unigram_table.py
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
class UnigramTable:
    """Incrementally maintained unigram table used to draw negative samples.

    Implements the incremental unigram table update proposed by Kaji and
    Kobayashi:

    1. While the table is not yet full, new entries are appended as in the
       original unigram table algorithm.
    2. Once the table is full, a number n is derived from the word's share
       of the normalizer and n randomly chosen slots are overwritten with
       the word index.

    References:
        1. Nobuhiro Kaji and Hayato Kobayashi. 2017. Incremental Skip-gram
           Model with Negative Sampling. In Proceedings of the 2017
           Conference on Empirical Methods in Natural Language Processing,
           pages 363–371, Copenhagen, Denmark. Association for Computational
           Linguistics.
    """

    def __init__(self, max_size: int = 100_000_000):
        """Initialize an empty unigram table.

        Args:
            max_size: Number of slots in the unigram table, by default
                100_000_000.

        Raises:
            TypeError: If max_size is not an int.
            ValueError: If max_size is not greater than 0.
        """
        if not isinstance(max_size, int):
            raise TypeError(f"max_size should be int, got {max_size}")
        # A zero-capacity table can never be sampled from, so reject it too.
        if max_size <= 0:
            raise ValueError(f"max_size should be greater than 0, got {max_size}")

        self.max_size = max_size  # capacity of the table
        self.size = 0  # number of slots filled so far
        self.z = 0  # running normalizer; set by build(), increased by update()
        self.table = np.zeros(self.max_size)

    def sample(self) -> int:
        """Obtain a negative sample from the unigram table.

        Returns:
            Index of the negative sample, drawn uniformly from the filled
            part of the table.

        Raises:
            ValueError: If the table is empty.
        """
        if not 0 < self.size:
            raise ValueError("cannot sample from an empty unigram table")
        # The table stores indices as floats; hand back a real int.
        return int(self.table[np.random.randint(0, self.size)])

    def samples(self, n: int) -> list:
        """Obtain n negative samples from the unigram table.

        Args:
            n: Number of negative samples.

        Returns:
            A list with n word indices (Python ints), drawn uniformly with
            replacement from the filled part of the table.

        Raises:
            ValueError: If the table is empty.
        """
        if not 0 < self.size:
            raise ValueError("cannot sample from an empty unigram table")
        positions = np.random.randint(0, self.size, size=n)
        return self.table[positions].astype(np.int64).tolist()

    def build(self, vocab: "Vocab", alpha: float) -> None:
        """Build the unigram table from the counts stored in a vocabulary.

        Each position of the counts array receives a number of slots
        proportional to its count raised to ``alpha``. Any previous content
        of the table is discarded.

        Args:
            vocab: Vocabulary; must expose ``counter`` (with ``keys()`` and
                ``to_numpy()``) and ``free_idxs``.
            alpha: Smoothing exponent applied to the counts.
        """
        reserved_idxs = set(vocab.counter.keys())
        free_idxs = vocab.free_idxs
        counts = vocab.counter.to_numpy(reserved_idxs | free_idxs)
        vocab_size = len(counts)

        counts_pow = np.power(counts, alpha)
        z = np.sum(counts_pow)

        # Nothing to distribute: an empty vocabulary would make np.vectorize
        # fail, and a non-positive normalizer would divide by zero.
        if vocab_size == 0 or not z > 0:
            self.z = z
            self.size = 0
            return

        nums = self.max_size * counts_pow / z
        # round_number is defined elsewhere in the module; it is assumed to
        # return an integer number of slots -- TODO confirm.
        nums = np.vectorize(round_number)(nums)
        sum_nums = np.sum(nums)
        # Rounding may overshoot the capacity; randomly take slots away
        # until the allocation fits.
        while self.max_size < sum_nums:
            w = int(np.random.randint(0, vocab_size))
            if 0 < nums[w]:
                nums[w] -= 1
                sum_nums -= 1

        self.z = z
        self.size = 0
        for w, num in enumerate(nums):
            copies = int(num)
            self.table[self.size : self.size + copies] = w
            self.size += copies

    def update(self, word_idx: int, F: float) -> None:
        """Update the unigram table according to a new word in the stream.

        Args:
            word_idx: Index of the word to update in the unigram table.
            F: Normalize value added to the running normalizer ``z``.

        Raises:
            ValueError: If word_idx or F is negative.
        """
        if not 0 <= word_idx:
            raise ValueError(f"word_idx should be non-negative, got {word_idx}")
        if not 0.0 <= F:
            raise ValueError(f"F should be non-negative, got {F}")

        self.z += F
        if self.size < self.max_size:
            # float() lets plain ints through; int gained is_integer() only
            # in recent Python versions.
            copies = int(F) if float(F).is_integer() else round_number(F)
            # Never write past the end of the table, and keep self.size
            # within the capacity so sampling stays in bounds.
            copies = min(copies, self.max_size - self.size)
            self.table[self.size : self.size + copies] = word_idx
            self.size += copies
        else:
            # Table is full: overwrite n random slots with the word.
            n = round_number((F / self.z) * self.max_size)
            table_idxs = np.random.randint(0, self.max_size, size=n)
            self.table[table_idxs] = word_idx
|
__init__(max_size=100000000)
Initialize a Unigram Table instance.
Parameters:
Name |
Type |
Description |
Default |
max_size |
int
|
Size of the unigram table, by default 100_000_000 |
100000000
|
Raises:
Type |
Description |
TypeError
|
The max size should be an int. |
ValueError
|
The max size should be greater than 0. |
Source code in rivertext/models/iword2vec/unigram_table.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45 | def __init__(self, max_size: int = 100_000_000):
"""Initialize a Unigram Table instance.
Args:
max_size: Size of the unigram table, by default 100_000_000
Raises:
TypeError: The max size should be int number.
ValueError: The max size should be greater than 0.
"""
if not isinstance(max_size, int):
raise TypeError(f"max_size should be int, got {max_size}")
if max_size < 0:
raise ValueError(f"max_size should be greater than , got {max_size}")
self.max_size = max_size
self.size = 0
self.z = 0
self.table = np.zeros(self.max_size)
|
build(vocab, alpha)
Build a unigram table based on the vocabulary structure.
Parameters:
Name |
Type |
Description |
Default |
vocab |
Vocab
|
Vocabulary. |
required
|
alpha |
float
|
Smoothed parameter. |
required
|
Source code in rivertext/models/iword2vec/unigram_table.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98 | def build(self, vocab: Vocab, alpha: float) -> None:
"""Build a unigram table based on the vocabulary structure.
Args:
vocab: Vocabulary.
alpha: Smoothed parameter.
"""
reserved_idxs = set(vocab.counter.keys())
free_idxs = vocab.free_idxs
counts = vocab.counter.to_numpy(reserved_idxs | free_idxs)
vocab_size = len(counts)
counts_pow = np.power(counts, alpha)
z = np.sum(counts_pow)
nums = self.max_size * counts_pow / z
nums = np.vectorize(round_number)(nums)
sum_nums = np.sum(nums)
while self.max_size < sum_nums:
w = int(np.random.randint(0, vocab_size))
if 0 < nums[w]:
nums[w] -= 1
sum_nums -= 1
self.z = z
self.size = 0
for w in range(vocab_size):
self.table[self.size : self.size + nums[w]] = w
self.size += nums[w]
|
sample()
Obtain a negative sample from the unigram table.
Returns:
Type |
Description |
int
|
Index of negative sample obtained. |
Source code in rivertext/models/iword2vec/unigram_table.py
47
48
49
50
51
52
53
54
55 | def sample(self) -> int:
"""Obtain a negative sample from the unigram table.
Returns:
Index of negative sample obtained.
"""
assert 0 < self.size
unigram_idx = self.table[np.random.randint(0, self.size)]
return unigram_idx
|
samples(n)
Obtain n negative samples from the unigram table
Parameters:
Name |
Type |
Description |
Default |
n |
int
|
Number of negative samples. |
required
|
Returns:
Type |
Description |
np.ndarray
|
An array of negative samples. |
Source code in rivertext/models/iword2vec/unigram_table.py
57
58
59
60
61
62
63
64
65
66
67 | def samples(self, n: int) -> np.ndarray:
"""Obtain n negative samples from the unigram table
Args:
n: Number of negative samples.
Returns:
A array of negative samples.
"""
unigram_idxs = list(self.table[np.random.randint(0, self.size, size=n)])
return unigram_idxs
|
update(word_idx, F)
Update the unigram table according to the new words in the text stream.
Parameters:
Name |
Type |
Description |
Default |
word_idx |
int
|
Index of the word to update in the unigram table. |
required
|
F |
float
|
Normalize value. |
required
|
Source code in rivertext/models/iword2vec/unigram_table.py
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125 | def update(self, word_idx: int, F: float) -> None:
"""Update the unigram table acording to the new words in the text stream.
Args:
word_idx: Index of the word to update in the unigram table.
F: Normalize value.
"""
assert 0 <= word_idx
assert 0.0 <= F
self.z += F
if self.size < self.max_size:
if F.is_integer():
copies = min(int(F), self.max_size)
self.table[self.size : self.size + copies] = word_idx
else:
copies = min(round_number(F), self.max_size)
self.table[self.size : self.size + copies] = word_idx
self.size += copies
else:
n = round_number((F / self.z) * self.max_size)
for _ in range(n):
table_idx = np.random.randint(0, self.max_size)
self.table[table_idx] = word_idx
|