data

Module for adding new iterable datasets.

`TweetStream`

Bases: IterableDataset

An iterable dataset that extends the `IterableDataset` class from the PyTorch package.

The Iterable Dataset class is designed to process big volumes of tweets that do not necessarily fit in memory.

The file on disk is expected to contain one tweet per line, i.e. tweets are separated by line breaks.

Examples:

>>> from torch.utils.data import DataLoader
>>> from rivertext.utils import TweetStream
>>> ts = TweetStream("/path/to/tweets.txt")
>>> dataloader = DataLoader(ts, batch_size=1)
>>> for batch in dataloader:
...     print(batch)
['hello how are you?']
['This is tweet example?']

Source code in rivertext/utils/data.py

class TweetStream(IterableDataset):
    """Stream tweets lazily from a text file that holds one tweet per line.

    The class extends the ``IterableDataset`` class from the PyTorch package and
    is designed to process big volumes of tweets that do not necessarily fit
    in memory: the file is read line by line instead of being loaded whole.

    The file on disk is expected to contain one tweet per line.

    Examples:
        >>> from torch.utils.data import DataLoader
        >>> from rivertext.utils import TweetStream
        >>> ts = TweetStream("/path/to/tweets.txt")
        >>> dataloader = DataLoader(ts, batch_size=1)
        >>> for batch in dataloader:
        ...     print(batch)
        ['hello how are you?']
        ['This is tweet example?']

    """

    def __init__(self, filename: str):
        """Create a stream over the tweets stored in ``filename``.

        The file is not opened here; it is opened each time the stream is
        iterated.

        Args:
            filename: Path to the tweets file on disk.
        """
        self.filename = filename

    def preprocess(self, text: str) -> str:
        """Strip the trailing line break(s) from a raw line of the file.

        Only trailing ``"\\n"`` characters are removed; any other whitespace
        (leading, inner or trailing spaces and tabs) is left untouched.

        Args:
            text: Raw line read from the tweets file.

        Returns:
            The tweet without its trailing newline character(s).
        """
        return text.rstrip("\n")

    def __iter__(self) -> Iterator[str]:
        """Return a fresh iterator over the tweets in the file.

        Every call opens the file again, so the stream can be iterated more
        than once. The file is closed as soon as the returned iterator is
        exhausted or finalized.

        Examples:
            >>> from rivertext.utils import TweetStream
            >>> ts = TweetStream("/path/to/tweets.txt")
            >>> tweets = iter(ts)
            >>> next(tweets)
            'hello how are you?'
            >>> next(tweets)
            'This is tweet example?'

        Yields:
            Each tweet of the file, in order, after ``preprocess`` was applied.

        Raises:
            OSError: If the file cannot be opened (raised when the iterator
                is created, not on the first item).
        """
        # Open eagerly so that an invalid path still fails at ``iter()`` time.
        handle = open(self.filename, encoding="utf-8")

        def _tweets() -> Iterator[str]:
            # The ``with`` block guarantees the handle is released once the
            # stream ends, instead of leaking it until garbage collection.
            with handle:
                for line in handle:
                    yield self.preprocess(line)

        return _tweets()

`__init__(filename)`

Create an instance of the TweetStream class.

Parameters:

Name	Type	Description	Default
`filename`	`str`	Path to the tweets file on disk.	required

Source code in rivertext/utils/data.py

def __init__(self, filename: str) -> None:
    """Create an instance of the TweetStream class.

    Only the path is stored on the instance; nothing is read here.

    Args:
        filename: Path to the tweets file on disk.
    """
    self.filename = filename

`__iter__()`

Return an iterator that reads the tweets from the file on disk one at a time.

Examples:

>>> from rivertext.utils import TweetStream
>>> ts = TweetStream("/path/to/tweets.txt")
>>> tweets = iter(ts)
>>> next(tweets)
'hello how are you?'
>>> next(tweets)
'This is tweet example?'

Yields:

Type	Description
`Iterator`	A generator of tweets.

Source code in rivertext/utils/data.py

def __iter__(self) -> Iterator[str]:
    """Return a fresh iterator over the tweets in the file.

    Every call opens the file again, so the stream can be iterated more than
    once. The file is closed as soon as the returned iterator is exhausted or
    finalized.

    Examples:
        >>> from rivertext.utils import TweetStream
        >>> ts = TweetStream("/path/to/tweets.txt")
        >>> tweets = iter(ts)
        >>> next(tweets)
        'hello how are you?'
        >>> next(tweets)
        'This is tweet example?'

    Yields:
        Each tweet of the file, in order, after ``preprocess`` was applied.

    Raises:
        OSError: If the file cannot be opened (raised when the iterator is
            created, not on the first item).
    """
    # Open eagerly so that an invalid path still fails at ``iter()`` time.
    handle = open(self.filename, encoding="utf-8")

    def _tweets() -> Iterator[str]:
        # The ``with`` block guarantees the handle is released once the
        # stream ends, instead of leaking it until garbage collection.
        with handle:
            for line in handle:
                yield self.preprocess(line)

    return _tweets()

`preprocess(text)`

Remove the trailing line break from the current tweet.

Parameters:

Name	Type	Description	Default
`text`	`str`	Raw tweet line whose trailing line break is removed.	required

Returns:

Type	Description
`str`	The tweet without its trailing line break.

Source code in rivertext/utils/data.py

def preprocess(self, text: str) -> str:
    """Strip the trailing line break(s) from a raw line of the file.

    Only trailing ``"\\n"`` characters are removed; any other whitespace
    (leading, inner or trailing spaces and tabs) is left untouched.

    Args:
        text: Raw line read from the tweets file.

    Returns:
        The tweet without its trailing newline character(s).
    """
    return text.rstrip("\n")

data

TweetStream

__init__(filename)

__iter__()

preprocess(text)

`TweetStream`

`__init__(filename)`

`__iter__()`

`preprocess(text)`