Skip to content

unigram_table

Incremental algorithm for drawing negative samples from a text data stream.

UnigramTable

The algorithm incrementally updates a unigram table, as proposed by Kaji and Kobayashi.

  1. While the table is incomplete, it is updated as the original unigram table algorithm.

  2. If the table is complete, a random number n is selected, and n copies of the word w are written to randomly chosen positions of the table.

References

1. Nobuhiro Kaji and Hayato Kobayashi. 2017. Incremental Skip-gram Model with Negative Sampling. In Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing, pages 363–371, Copenhagen, Denmark. Association for Computational Linguistics.

Source code in rivertext/models/iword2vec/unigram_table.py
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
class UnigramTable:
    """Incrementally maintained unigram table used to draw negative samples.

    The table is updated with the incremental algorithm that Kaji and
    Kobayashi proposed.

    1. While the table is incomplete, it is updated as the original unigram table
    algorithm: copies of the word are appended to the filled prefix.

    2. If the table is complete, a random number n is selected, and n randomly
    chosen entries of the table are overwritten with the word w.

    Attributes:
        max_size: Capacity of the table (number of entries).
        size: Number of entries currently filled; always ``<= max_size``.
        z: Running normalizer, i.e. the sum of all weights seen so far.
        table: NumPy array of length ``max_size`` whose first ``size`` entries
            hold word indices.

    References:
        1. Nobuhiro Kaji and Hayato Kobayashi. 2017. Incremental Skip-gram Model
            with Negative Sampling. In Proceedings of the 2017 Conference on
            Empirical Methods in Natural Language Processing, pages 363–371,
            Copenhagen, Denmark. Association for Computational Linguistics.

    """

    def __init__(self, max_size: int = 100_000_000):
        """Initialize a Unigram Table instance.

        Args:
            max_size: Size of the unigram table, by default 100_000_000

        Raises:
            TypeError: The max size should be int number.
            ValueError: The max size should be greater than 0.
        """

        if not isinstance(max_size, int):
            raise TypeError(f"max_size should be int, got {max_size}")

        # A table with zero entries can never be sampled from, so 0 is
        # rejected together with negative sizes, as the docstring promises.
        if max_size <= 0:
            raise ValueError(f"max_size should be greater than 0, got {max_size}")

        self.max_size = max_size
        self.size = 0
        self.z = 0
        self.table = np.zeros(self.max_size)

    def sample(self) -> int:
        """Obtain a negative sample from the unigram table.

        The sample is drawn uniformly from the filled part of the table.

        Returns:
            Index of negative sample obtained, as a Python int.
        """
        assert 0 < self.size
        # The table stores indices in a float array; convert back so callers
        # receive the integer index the signature promises.
        return int(self.table[np.random.randint(0, self.size)])

    def samples(self, n: int) -> list:
        """Obtain n negative samples from the unigram table.

        The samples are drawn uniformly, with replacement, from the filled
        part of the table.

        Args:
            n: Number of negative samples.

        Returns:
            A list with n word indices (Python ints).
        """
        positions = np.random.randint(0, self.size, size=n)
        # Same float-to-int conversion as in `sample`, applied to the batch.
        return self.table[positions].astype(np.int64).tolist()

    def build(self, vocab: Vocab, alpha: float) -> None:
        """Build a unigram table based on the vocabulary structure.

        Every position ``w`` of the counts array gets roughly
        ``max_size * counts[w] ** alpha / sum(counts ** alpha)`` entries, and
        the table is refilled from position 0.

        Args:
            vocab: Vocabulary. Its ``counter`` and ``free_idxs`` attributes
                are read.
            alpha: Smoothed parameter (exponent applied to the counts).
        """

        reserved_idxs = set(vocab.counter.keys())
        free_idxs = vocab.free_idxs
        counts = vocab.counter.to_numpy(reserved_idxs | free_idxs)
        vocab_size = len(counts)
        counts_pow = np.power(counts, alpha)
        z = np.sum(counts_pow)
        nums = self.max_size * counts_pow / z
        # `round_number` is defined elsewhere in the project; it is expected
        # to turn each share into an integer number of entries.
        nums = np.vectorize(round_number)(nums)
        sum_nums = np.sum(nums)

        # Rounding can overshoot the capacity: remove single entries from
        # randomly chosen words until everything fits.
        while self.max_size < sum_nums:
            w = int(np.random.randint(0, vocab_size))
            if 0 < nums[w]:
                nums[w] -= 1
                sum_nums -= 1

        self.z = z
        self.size = 0

        # Lay the words out as consecutive runs of nums[w] entries each.
        for w in range(vocab_size):
            self.table[self.size : self.size + nums[w]] = w
            self.size += nums[w]

    def update(self, word_idx: int, F: float) -> None:
        """Update the unigram table according to the new words in the text stream.

        While the table still has free entries, copies of ``word_idx`` are
        appended (never beyond the capacity). Once it is full, randomly chosen
        entries are overwritten instead.

        Args:
            word_idx: Index of the word to update in the unigram table.
            F: Weight of the update; it is added to the normalizer ``z`` and
                determines how many copies of the word are written.
        """

        assert 0 <= word_idx
        assert 0.0 <= F

        self.z += F
        if self.size < self.max_size:
            # float() lets int-valued F through on interpreters where int has
            # no is_integer() method.
            if float(F).is_integer():
                copies = int(F)
            else:
                copies = round_number(F)
            # Clamp to the free space that is left; otherwise `size` would
            # run past the array and sampling could index out of bounds.
            copies = min(copies, self.max_size - self.size)
            self.table[self.size : self.size + copies] = word_idx
            self.size += copies

        else:
            # Table is full: replace a share of entries proportional to F / z.
            n = round_number((F / self.z) * self.max_size)
            for _ in range(n):
                table_idx = np.random.randint(0, self.max_size)
                self.table[table_idx] = word_idx

__init__(max_size=100000000)

Initialize a Unigram Table instance.

Parameters:

Name Type Description Default
max_size int

Size of the unigram table, by default 100_000_000

100000000

Raises:

Type Description
TypeError

The max size should be int number.

ValueError

The max size should be greater than 0.

Source code in rivertext/models/iword2vec/unigram_table.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def __init__(self, max_size: int = 100_000_000):
    """Initialize a Unigram Table instance.

    Sets up an empty table: ``size`` and the normalizer ``z`` start at 0 and
    ``table`` is a zero-filled NumPy array with ``max_size`` entries.

    Args:
        max_size: Size of the unigram table, by default 100_000_000

    Raises:
        TypeError: The max size should be int number.
        ValueError: The max size should be greater than 0.
    """

    if not isinstance(max_size, int):
        raise TypeError(f"max_size should be int, got {max_size}")

    # A table with zero entries can never be sampled from, so 0 is rejected
    # together with negative sizes, as the docstring promises.
    if max_size <= 0:
        raise ValueError(f"max_size should be greater than 0, got {max_size}")

    self.max_size = max_size
    self.size = 0
    self.z = 0
    self.table = np.zeros(self.max_size)

build(vocab, alpha)

Build a unigram table based on the vocabulary structure.

Parameters:

Name Type Description Default
vocab Vocab

Vocabulary.

required
alpha float

Smoothed parameter.

required
Source code in rivertext/models/iword2vec/unigram_table.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def build(self, vocab: Vocab, alpha: float) -> None:
    """Fill the unigram table from scratch using the counts of a vocabulary.

    Each position of the counts array receives a number of table entries
    proportional to its count raised to ``alpha``. The normalizer ``z`` and
    the fill level ``size`` are reset accordingly.

    Args:
        vocab: Vocabulary; its ``counter`` and ``free_idxs`` are read.
        alpha: Smoothed parameter (exponent applied to the counts).
    """

    used_idxs = set(vocab.counter.keys())
    unused_idxs = vocab.free_idxs
    freqs = vocab.counter.to_numpy(used_idxs | unused_idxs)
    n_words = len(freqs)
    smoothed = np.power(freqs, alpha)
    normalizer = np.sum(smoothed)

    # Share of the table per word, turned into entry counts by the
    # project's `round_number` helper (defined outside this block).
    shares = self.max_size * smoothed / normalizer
    quota = np.vectorize(round_number)(shares)
    total = np.sum(quota)

    # Rounding may overshoot the capacity; drop one entry at a time from a
    # randomly picked word that still has entries until the total fits.
    while total > self.max_size:
        pick = int(np.random.randint(0, n_words))
        if quota[pick] > 0:
            quota[pick] -= 1
            total -= 1

    self.z = normalizer
    self.size = 0

    # Write one consecutive run of entries per word.
    for idx in range(n_words):
        stop = self.size + quota[idx]
        self.table[self.size : stop] = idx
        self.size = stop

sample()

Obtain a negative sample from the unigram table.

Returns:

Type Description
int

Index of negative sample obtained.

Source code in rivertext/models/iword2vec/unigram_table.py
47
48
49
50
51
52
53
54
55
def sample(self) -> int:
    """Obtain a negative sample from the unigram table.

    The sample is drawn uniformly from the first ``self.size`` entries of
    ``self.table``.

    Returns:
        Index of negative sample obtained, as a Python int.
    """
    assert 0 < self.size
    # Table entries are read back as NumPy scalars; convert so the caller
    # gets the integer index the signature promises.
    return int(self.table[np.random.randint(0, self.size)])

samples(n)

Obtain n negative samples from the unigram table

Parameters:

Name Type Description Default
n int

Number of negative samples.

required

Returns:

Type Description
np.ndarray

An array of negative samples.

Source code in rivertext/models/iword2vec/unigram_table.py
57
58
59
60
61
62
63
64
65
66
67
def samples(self, n: int) -> list:
    """Obtain n negative samples from the unigram table.

    The samples are drawn uniformly, with replacement, from the first
    ``self.size`` entries of ``self.table``.

    Args:
        n: Number of negative samples.

    Returns:
        A list with n word indices (Python ints).
    """
    positions = np.random.randint(0, self.size, size=n)
    # Table entries are word indices; hand them back as plain ints rather
    # than as NumPy scalars of the table's dtype.
    return self.table[positions].astype(np.int64).tolist()

update(word_idx, F)

Update the unigram table according to the new words in the text stream.

Parameters:

Name Type Description Default
word_idx int

Index of the word to update in the unigram table.

required
F float

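Weight of the update: it is added to the normalizer z and determines how many copies of the word are written to the table.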

required
Source code in rivertext/models/iword2vec/unigram_table.py
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def update(self, word_idx: int, F: float) -> None:
    """Update the unigram table according to the new words in the text stream.

    While the table still has free entries, copies of ``word_idx`` are
    appended (never beyond the capacity). Once it is full, randomly chosen
    entries are overwritten instead.

    Args:
        word_idx: Index of the word to update in the unigram table.
        F: Weight of the update; it is added to the normalizer ``z`` and
            determines how many copies of the word are written.
    """

    assert 0 <= word_idx
    assert 0.0 <= F

    self.z += F
    if self.size < self.max_size:
        # float() lets int-valued F through on interpreters where int has no
        # is_integer() method.
        if float(F).is_integer():
            copies = int(F)
        else:
            # `round_number` is defined elsewhere in the project.
            copies = round_number(F)
        # Clamp to the free space that is left; otherwise `size` would run
        # past the array and sampling could index out of bounds.
        copies = min(copies, self.max_size - self.size)
        self.table[self.size : self.size + copies] = word_idx
        self.size += copies

    else:
        # Table is full: replace a share of entries proportional to F / z.
        n = round_number((F / self.z) * self.max_size)
        for _ in range(n):
            table_idx = np.random.randint(0, self.max_size)
            self.table[table_idx] = word_idx