Skip to content

vocab

Container class for saving vocabulary words

Context

Bases: Vocab

Container class for saving the contexts in the WCM model.

Source code in rivertext/utils/vocab.py
146
147
148
149
150
class Context(Vocab):
    """Vocabulary subclass used to hold the contexts of the WCM model.

    It adds no state or behavior of its own; everything is inherited from
    ``Vocab``. Unlike the base class, ``max_size`` has no default here and
    must always be supplied.
    """

    def __init__(self, max_size):
        """Create an empty context container.

        Args:
            max_size: Capacity of the container, forwarded unchanged to the
                ``Vocab`` initializer.
        """
        super(Context, self).__init__(max_size)

Vocab

The Container class efficiently stores the mapping of words to their corresponding vector representations and supports all necessary elements required by an architecture, such as lookup tables, counters, and space indexes. It is an essential tool for managing and accessing word embeddings.

References
  1. https://github.com/yahoojapan/yskip/blob/master/src/vocab.h
Source code in rivertext/utils/vocab.py
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
class Vocab:
    """The Container class efficiently stores the mapping of words to their
    corresponding vector representations and supports all necessary elements
    required by an architecture, such as lookup tables, counters, and space indexes.
    It is an essential tool for managing and accessing word embeddings.

    The container is bounded by ``max_size``. Indexes released by ``delete``
    are kept in ``free_idxs`` and handed out again by ``add`` before any
    never-used index is allocated.

    References:
        1. https://github.com/yahoojapan/yskip/blob/master/src/vocab.h
    """

    def __init__(self, max_size: int = 1_000_000):
        """Initialize a Vocab instance

        Args:
            max_size:
                The size of the Vocabulary, by default 1_000_000.

        Raises:
            TypeError: The max size should be an int.
            ValueError: The max size should be greater than 0.
        """

        if not isinstance(max_size, int):
            raise TypeError(f"max_size should be int, got {max_size}")
        # The documented contract (and the message) is "greater than 0", so
        # zero must be rejected as well as negative values.
        if max_size <= 0:
            raise ValueError(f"max_size should be greater than 0, got {max_size}")

        self.max_size = max_size
        # Number of words currently stored.
        self.size = 0

        # word -> index and index -> word lookup tables.
        self.word2idx = VectorDict()
        self.idx2word = VectorDict()

        # Indexes released by ``delete`` that ``add`` may hand out again.
        self.free_idxs: Set[int] = set()

        # index -> number of times the word has been added.
        self.counter = VectorDict()

        # Becomes True the first time the vocabulary reaches ``max_size``.
        self.first_full = False

    def add(self, word: str) -> int:
        """Add a new word, or count one more occurrence of a known word.

        When a new word is added to the Vocabulary class, multiple data structures are
        updated to maintain the accuracy of the lookup tables, word counts, and
        available index for new entries.

        Args:
            word: New word to add.

        Returns:
            Index mapped to the word. If the word is new and the vocabulary is
                full, nothing is stored and the method returns -1.
        """
        if word in self.word2idx:
            word_idx = self.word2idx[word]
            self.counter[word_idx] += 1
            return word_idx

        if self.is_full():
            # Explicit sentinel instead of an implicit ``None``.
            return -1

        # Reuse a released index whenever one exists. While ``free_idxs`` is
        # empty the used indexes are exactly 0..size-1, so ``size`` is the next
        # unused one. Taking ``size`` while a released index is pending would
        # collide with an index that is still mapped to another word.
        if self.free_idxs:
            word_idx = self.free_idxs.pop()
        else:
            word_idx = self.size

        self.word2idx[word] = word_idx
        self.idx2word[word_idx] = word
        self.counter[word_idx] = 1
        self.size += 1

        if self.is_full():
            self.first_full = True
        return word_idx

    def add_tokens(self, tokens: List[str]) -> None:
        """Add a list of new words.

        Args:
            tokens: List of words to add.
        """
        for token in tokens:
            self.add(token)

    def is_full(self) -> bool:
        """Check if the vocabulary is full.

        Returns:
            True if the vocabulary structure is full, otherwise False.
        """
        return self.size == self.max_size

    def __len__(self) -> int:
        """Obtain the number of words inside the vocabulary.

        Returns:
            Number of words inside the vocabulary.
        """
        return len(self.word2idx)

    def __contains__(self, word: str) -> bool:
        """Check if a word is in the vocabulary.

        Args:
            word: Word to check.

        Returns:
            True if the word is in the vocabulary structure, otherwise False.
        """
        return word in self.word2idx

    def __getitem__(self, word: str) -> int:
        """Obtain the index of a given word. If the word is not in the vocabulary,
        it returns -1.

        Args:
            word:
                word to get the index value.

        Returns:
            The value of index if the word is in the vocabulary, otherwise -1.
        """
        if word in self.word2idx:
            return self.word2idx[word]
        return -1

    def delete(self, idx: int) -> None:
        """Delete the word mapped to the index idx.

        Args:
            idx:
                Index of the word.

        Raises:
            KeyError: No word is mapped to ``idx``. The vocabulary is left
                untouched in that case.
        """
        # Validate before mutating anything so a bad index cannot end up in
        # ``free_idxs`` and later be handed out by ``add``.
        if idx not in self.idx2word:
            raise KeyError(idx)

        word = self.idx2word[idx]
        del self.word2idx[word]
        del self.idx2word[idx]
        del self.counter[idx]
        self.free_idxs.add(idx)
        self.size -= 1

__contains__(word)

Check if a word is in the vocabulary.

Parameters:

Name Type Description Default
word str

Word to check.

required

Returns:

Type Description
bool

True if the word is in the vocabulary structure, otherwise False.

Source code in rivertext/utils/vocab.py
104
105
106
107
108
109
110
111
112
113
def __contains__(self, word: str) -> bool:
    """Report whether ``word`` currently has an entry in the vocabulary.

    Args:
        word: Word to look for.

    Returns:
        True when ``word`` is a key of the word-to-index table, otherwise False.
    """
    known_words = self.word2idx
    return word in known_words

__getitem__(word)

Obtain the index of a given word. If the word is not in the vocabulary, it returns -1.

Parameters:

Name Type Description Default
word str

word to get the index value.

required

Returns:

Type Description
int

The value of index if the word is in the vocabulary, otherwise -1.

Source code in rivertext/utils/vocab.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def __getitem__(self, word: str) -> int:
    """Look up the index assigned to ``word``.

    Args:
        word:
            Word whose index is requested.

    Returns:
        The index stored for ``word``, or -1 when the word is not in the
        vocabulary.
    """
    # Guard clause: unknown words map to the -1 sentinel.
    if word not in self.word2idx:
        return -1
    return self.word2idx[word]

__init__(max_size=1000000)

Initialize a Vocab instance

Parameters:

Name Type Description Default
max_size int

The size of the Vocabulary, by default 1_000_000.

1000000

Raises:

Type Description
TypeError

The max size should be an integer.

ValueError

The max size should be greater than 0.

Source code in rivertext/utils/vocab.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
def __init__(self, max_size: int = 1_000_000):
    """Initialize a Vocab instance

    Args:
        max_size:
            The size of the Vocabulary, by default 1_000_000.

    Raises:
        TypeError: The max size should be an int.
        ValueError: The max size should be greater than 0.
    """

    if not isinstance(max_size, int):
        raise TypeError(f"max_size should be int, got {max_size}")
    # The documented contract (and the message) is "greater than 0", so zero
    # must be rejected as well as negative values.
    if max_size <= 0:
        raise ValueError(f"max_size should be greater than 0, got {max_size}")

    self.max_size = max_size
    # Number of words currently stored.
    self.size = 0

    # word -> index and index -> word lookup tables.
    self.word2idx = VectorDict()
    self.idx2word = VectorDict()

    # Indexes released by ``delete`` that ``add`` may hand out again.
    self.free_idxs: Set[int] = set()

    # index -> number of times the word has been added.
    self.counter = VectorDict()

    # Becomes True the first time the vocabulary reaches ``max_size``.
    self.first_full = False

__len__()

Obtain the number of words inside the vocabulary.

Returns:

Type Description
int

Number of words inside the vocabulary.

Source code in rivertext/utils/vocab.py
 96
 97
 98
 99
100
101
102
def __len__(self) -> int:
    """Count the words currently held in the vocabulary.

    Returns:
        Number of entries in the word-to-index table.
    """
    word_table = self.word2idx
    return len(word_table)

add(word)

Add a new word.

When a new word is added to the Vocabulary class, multiple data structures are updated to maintain the accuracy of the lookup tables, word counts, and available index for new entries.

Parameters:

Name Type Description Default
word str

New word to add.

required

Returns:

Type Description
int

Index mapped to the new word. If the max size is equal to the current size, the method returns -1.

Source code in rivertext/utils/vocab.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def add(self, word: str) -> int:
    """Add a new word, or count one more occurrence of a known word.

    When a new word is added to the Vocabulary class, multiple data structures are
    updated to maintain the accuracy of the lookup tables, word counts, and
    available index for new entries.

    Args:
        word: New word to add.

    Returns:
        Index mapped to the word. If the word is new and the vocabulary is
            full, nothing is stored and the method returns -1.
    """
    if word in self.word2idx:
        word_idx = self.word2idx[word]
        self.counter[word_idx] += 1
        return word_idx

    if self.is_full():
        # Explicit sentinel instead of an implicit ``None``.
        return -1

    # Reuse a released index whenever one exists. While ``free_idxs`` is empty
    # the used indexes are exactly 0..size-1, so ``size`` is the next unused
    # one. Taking ``size`` while a released index is pending would collide
    # with an index that is still mapped to another word.
    if self.free_idxs:
        word_idx = self.free_idxs.pop()
    else:
        word_idx = self.size

    self.word2idx[word] = word_idx
    self.idx2word[word_idx] = word
    self.counter[word_idx] = 1
    self.size += 1

    if self.is_full():
        self.first_full = True
    return word_idx

add_tokens(tokens)

Add a list of new words.

Parameters:

Name Type Description Default
tokens List[str]

List of words to add.

required
Source code in rivertext/utils/vocab.py
79
80
81
82
83
84
85
86
def add_tokens(self, tokens: List[str]) -> None:
    """Feed every word of ``tokens`` to ``add``, in order.

    Args:
        tokens: Words to add; each one is passed to ``self.add`` and the
            returned index is discarded.
    """
    register = self.add
    for current_word in tokens:
        register(current_word)

delete(idx)

Delete the word mapped to the index idx.

Parameters:

Name Type Description Default
idx int

Index of the word.

required
Source code in rivertext/utils/vocab.py
131
132
133
134
135
136
137
138
139
140
141
142
143
def delete(self, idx: int) -> None:
    """Delete the word mapped to the index idx.

    Args:
        idx:
            Index of the word.

    Raises:
        KeyError: No word is mapped to ``idx``. The vocabulary is left
            untouched in that case.
    """
    # Validate before mutating anything so a bad index cannot end up in
    # ``free_idxs`` and later be handed out by ``add``.
    if idx not in self.idx2word:
        raise KeyError(idx)

    word = self.idx2word[idx]
    del self.word2idx[word]
    del self.idx2word[idx]
    del self.counter[idx]
    self.free_idxs.add(idx)
    self.size -= 1

is_full()

Check if the vocabulary is full.

Returns:

Type Description
bool

True if the vocabulary structure is full, otherwise False.

Source code in rivertext/utils/vocab.py
88
89
90
91
92
93
94
def is_full(self) -> bool:
    """Tell whether the vocabulary has reached its capacity.

    Returns:
        True when the current size equals ``max_size``, otherwise False.
    """
    capacity = self.max_size
    stored = self.size
    return stored == capacity