Skip to content

data

Module for adding new iterable datasets.

TweetStream

Bases: IterableDataset

TweetStream extends the IterableDataset class from the PyTorch package.

The Iterable Dataset class is designed to process big volumes of tweets that do not necessarily fit in memory.

The file on disk is expected to contain one tweet per line, i.e. tweets are separated by line breaks.

Examples:

>>> from torch.utils.data import DataLoader
>>> from rivertext.utils import TweetStream
>>> ts = TweetStream("/path/to/tweets.txt")
>>> dataloader = DataLoader(ts, batch_size=1)
>>> for batch in dataloader:
...     print(batch)
['hello how are you?']
['This is tweet example?']
Source code in rivertext/utils/data.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
class TweetStream(IterableDataset):
    """Iterable dataset that streams tweets lazily from a text file.

    Extends PyTorch's ``IterableDataset`` so that big volumes of tweets that do
    not necessarily fit in memory can be processed one tweet at a time.

    The file on disk is expected to contain one tweet per line.

    Examples:
        >>> from torch.utils.data import DataLoader
        >>> from rivertext.utils import TweetStream
        >>> ts = TweetStream("/path/to/tweets.txt")
        >>> dataloader = DataLoader(ts, batch_size=1)
        >>> for batch in dataloader:
        ...     print(batch)
        ['hello how are you?']
        ['This is tweet example?']

    """

    def __init__(self, filename: str):
        """Create a stream over the tweets stored in ``filename``.

        The file is not opened here; it is opened each time the dataset is
        iterated.

        Args:
            filename: path to the tweets file on disk.
        """
        super().__init__()
        self.filename = filename

    def preprocess(self, text: str) -> str:
        """Strip the trailing line break(s) from a tweet.

        Only ``"\\n"`` characters at the end of the text are removed; any other
        whitespace (spaces, tabs, leading characters) is kept as is.

        Args:
            text: raw line read from the tweets file.

        Returns:
            The tweet without its trailing line break(s).
        """
        return text.rstrip("\n")

    def __iter__(self) -> Iterator[str]:
        """Iterate over the tweets in the file, one line at a time.

        The file is opened lazily when the first tweet is requested and is
        closed as soon as the iteration finishes (or the generator is closed),
        so no file handle is leaked. Every call starts again from the
        beginning of the file.

        Examples:
            >>> from rivertext.utils import TweetStream
            >>> ts = TweetStream("/path/to/tweets.txt")
            >>> it = iter(ts)
            >>> next(it)
            'hello how are you?'
            >>> next(it)
            'This is tweet example?'

        Yields:
            Each tweet of the file, passed through ``preprocess``.
        """
        # The context manager guarantees the handle is released even when the
        # consumer stops iterating early.
        with open(self.filename, encoding="utf-8") as tweets_file:
            for line in tweets_file:
                yield self.preprocess(line)

__init__(filename)

Create an instance of the TweetStream class.

Parameters:

Name Type Description Default
filename str

Path to the tweets file on disk.

required
Source code in rivertext/utils/data.py
27
28
29
30
31
32
33
34
35
def __init__(self, filename: str):
    """Store the location of the tweets file for later iteration.

    Nothing is read at construction time; only the path is kept on the
    instance.

    Args:
        filename: path to the tweets file on disk.
    """
    self.filename = filename

__iter__()

Iterate lazily over the tweets stored in the file on disk, creating a generator.

Examples:

>>> from rivertext.utils import TweetStream
>>> ts = TweetStream("/path/to/tweets.txt")
>>> it = iter(ts)
>>> next(it)
'hello how are you?'
>>> next(it)
'This is tweet example?'

Yields:

Type Description
Iterator

A generator of tweets.

Source code in rivertext/utils/data.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def __iter__(self) -> Iterator:
    """Iterate over the tweets in the file, one line at a time.

    The file is opened lazily when the first tweet is requested and is closed
    as soon as the iteration finishes (or the generator is closed), so no file
    handle is leaked. Every call starts again from the beginning of the file.

    Examples:
        >>> from rivertext.utils import TweetStream
        >>> ts = TweetStream("/path/to/tweets.txt")
        >>> it = iter(ts)
        >>> next(it)
        'hello how are you?'
        >>> next(it)
        'This is tweet example?'

    Yields:
        Each line of the file, passed through ``self.preprocess``.
    """
    # The context manager guarantees the handle is released even when the
    # consumer stops iterating early.
    with open(self.filename, encoding="utf-8") as tweets_file:
        for line in tweets_file:
            yield self.preprocess(line)

preprocess(text)

Remove the trailing line break from the current tweet.

Parameters:

Name Type Description Default
text str

tweet from which the trailing line break is removed.

required

Returns:

Type Description
str

The tweet without its trailing line break.

Source code in rivertext/utils/data.py
37
38
39
40
41
42
43
44
45
46
47
48
def preprocess(self, text: str) -> str:
    """Strip the trailing line break(s) from a tweet.

    Only ``"\\n"`` characters at the end of the text are removed; any other
    whitespace (spaces, tabs, leading characters) is kept as is.

    Args:
        text: raw line read from the tweets file.

    Returns:
        The tweet without its trailing line break(s).
    """
    return text.rstrip("\n")