Skip to content

base

Contains a base class for implementing any incremental method in RiverText.

IWVBase

Bases: Transformer, VectorizerMixin

Base class for implementing any incremental method in RiverText.

Source code in rivertext/models/base/iwv.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
class IWVBase(Transformer, VectorizerMixin):
    """Shared foundation for implementing any incremental method in RiverText.

    Stores the hyperparameters common to every incremental method and declares
    the abstract hooks `learn_many` and `vocab2dict`.
    """

    def __init__(
        self,
        vocab_size: int,
        vector_size: int,
        window_size: int,
        on: str = None,
        strip_accents: bool = True,
        lowercase: bool = True,
        preprocessor=None,
        tokenizer: Callable[[str], List[str]] = None,
        ngram_range: Tuple[int, int] = (1, 1),
    ):
        """Set up the hyperparameters common to all incremental methods.

        The text-processing options are forwarded unchanged to the parent
        initializer; the three size hyperparameters are stored on the instance.

        Args:
            vocab_size: Size of the vocabulary.
            vector_size: Dimension of the embedding.
            window_size: Size of the window.
            on: Name of the feature that holds the text to vectorize. When
                `None`, `learn_one` and `transform_one` should treat `x` as a
                `str` rather than a `dict`. Defaults to None.
            strip_accents: Whether accent characters are stripped. Defaults to
                True.
            lowercase: Whether all characters are converted to lowercase.
                Defaults to True.
            preprocessor: Optional preprocessing function that overrides the
                `strip_accents` and `lowercase` steps while keeping the
                tokenizing and n-gram generation steps. Defaults to None.
            tokenizer: Function that turns preprocessed text into tokens. A
                default tokenizer is used when `None` is passed; set to `False`
                to disable tokenization. Defaults to None.
            ngram_range: Lower and upper bounds `(min_n, max_n)` of the n-gram
                sizes to extract; every `n` with `min_n <= n <= max_n` is used.
                `(1, 1)` means only unigrams, `(1, 2)` unigrams and bigrams,
                and `(2, 2)` only bigrams. Defaults to (1, 1).
        """
        # Text-processing options are handled by the parent initializer.
        text_options = {
            "on": on,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "preprocessor": preprocessor,
            "tokenizer": tokenizer,
            "ngram_range": ngram_range,
        }
        super().__init__(**text_options)

        # Hyperparameters kept on the instance for concrete subclasses.
        self.vocab_size = vocab_size
        self.vector_size = vector_size
        self.window_size = window_size

    @abc.abstractmethod
    def learn_many(self, X: List[str], y=None, **kwargs) -> None:
        """Update the model from a mini-batch of text features.

        Args:
            X: A list of sentence features.
            y: A series of target values. Defaults to None.
            **kwargs: Extra keyword arguments accepted by the signature.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def vocab2dict(self) -> Dict[str, np.ndarray]:
        """Export the vocabulary as a dictionary of embeddings.

        The keys are the words of the vocabulary and the values are their
        vectors.

        Returns:
            A dictionary of embeddings.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError

__init__(vocab_size, vector_size, window_size, on=None, strip_accents=True, lowercase=True, preprocessor=None, tokenizer=None, ngram_range=(1, 1))

Base constructor for common hyperparameters.

Parameters:

Name Type Description Default
vocab_size int

The size of the vocabulary.

required
vector_size int

The dimension of the embedding.

required
window_size int

The size of the window.

required
on str

The name of the feature that contains the text to vectorize. If None, then each learn_one and transform_one should treat x as a str and not as a dict, by default None.

None
strip_accents bool

Whether or not to strip accent characters, by default True.

True
lowercase bool

Whether or not to convert all characters to lowercase, by default True.

True
preprocessor

An optional preprocessing function which overrides the strip_accents and lowercase steps, while preserving the tokenizing and n-grams generation steps, by default None.

None
tokenizer Callable[[str], List[str]]

A function used to convert preprocessed text into a list of tokens. A default tokenizer is used if None is passed. Set to False to disable tokenization, by default None.

None
ngram_range Tuple[int, int]

The lower and upper boundaries of the range of n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used. For example, an ngram_range of (1, 1) means only unigrams, (1, 2) means unigrams and bigrams, and (2, 2) means only bigrams, by default (1, 1).

(1, 1)
Source code in rivertext/models/base/iwv.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def __init__(
    self,
    vocab_size: int,
    vector_size: int,
    window_size: int,
    on: str = None,
    strip_accents: bool = True,
    lowercase: bool = True,
    preprocessor=None,
    tokenizer: Callable[[str], List[str]] = None,
    ngram_range: Tuple[int, int] = (1, 1),
):
    """Set up the hyperparameters common to all incremental methods.

    The text-processing options are forwarded unchanged to the parent
    initializer; the three size hyperparameters are stored on the instance.

    Args:
        vocab_size: Size of the vocabulary.
        vector_size: Dimension of the embedding.
        window_size: Size of the window.
        on: Name of the feature that holds the text to vectorize. When
            `None`, `learn_one` and `transform_one` should treat `x` as a
            `str` rather than a `dict`. Defaults to None.
        strip_accents: Whether accent characters are stripped. Defaults to
            True.
        lowercase: Whether all characters are converted to lowercase.
            Defaults to True.
        preprocessor: Optional preprocessing function that overrides the
            `strip_accents` and `lowercase` steps while keeping the
            tokenizing and n-gram generation steps. Defaults to None.
        tokenizer: Function that turns preprocessed text into tokens. A
            default tokenizer is used when `None` is passed; set to `False`
            to disable tokenization. Defaults to None.
        ngram_range: Lower and upper bounds `(min_n, max_n)` of the n-gram
            sizes to extract; every `n` with `min_n <= n <= max_n` is used.
            `(1, 1)` means only unigrams, `(1, 2)` unigrams and bigrams,
            and `(2, 2)` only bigrams. Defaults to (1, 1).
    """
    # Text-processing options are handled by the parent initializer.
    text_options = {
        "on": on,
        "strip_accents": strip_accents,
        "lowercase": lowercase,
        "preprocessor": preprocessor,
        "tokenizer": tokenizer,
        "ngram_range": ngram_range,
    }
    super().__init__(**text_options)

    # Hyperparameters kept on the instance for concrete subclasses.
    self.vocab_size = vocab_size
    self.vector_size = vector_size
    self.window_size = window_size

learn_many(X, y=None, **kwargs) abstractmethod

Train a mini-batch of text features.

Parameters:

Name Type Description Default
X List[str]

A list of sentence features.

required
y

A series of target values, by default None.

None
Source code in rivertext/models/base/iwv.py
64
65
66
67
68
69
70
71
72
@abc.abstractmethod
def learn_many(self, X: List[str], y=None, **kwargs) -> None:
    """Update the model from a mini-batch of text features.

    Args:
        X: A list of sentence features.
        y: A series of target values. Defaults to None.
        **kwargs: Extra keyword arguments accepted by the signature.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

vocab2dict() abstractmethod

Abstract method for transforming the vocabulary into a dictionary. The keys are the words of the vocabulary, and the values are the trained embedding vectors.

Returns:

Type Description
Dict[str, np.ndarray]

A dictionary of embeddings.

Source code in rivertext/models/base/iwv.py
74
75
76
77
78
79
80
81
82
83
84
@abc.abstractmethod
def vocab2dict(self) -> Dict[str, np.ndarray]:
    """Export the vocabulary as a dictionary of embeddings.

    The keys are the words of the vocabulary and the values are their
    vectors.

    Returns:
        A dictionary of embeddings.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError