Skip to content

lastfm

logger = logging.getLogger(__name__) module-attribute

LastFMDataset

Bases: Dataset

Last FM dataset.

The Last FM dataset contains user interactions with artists. The tags in this dataset are not used in this implementation. The file that will be used is the user_taggedartists-timestamps.dat file. The dataset contains the following columns: [user, artist, tags, timestamp].

The dataset is downloaded from the GroupLens website :cite:Cantador_RecSys2011.

Source code in src/recnexteval/datasets/datasets/lastfm.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
class LastFMDataset(Dataset):
    """
    Last FM dataset.

    The Last FM dataset contains user interactions with artists. The tags in
    this dataset are not used in this implementation. The file that will be
    used is the user_taggedartists-timestamps.dat file. The dataset contains
    the following columns: [user, artist, tags, timestamp].

    The dataset is downloaded from the GroupLens website :cite:`Cantador_RecSys2011`.
    """

    # Concrete dataset, not an abstract base class.
    IS_BASE: bool = False

    config: ClassVar[LastFMDatasetConfig] = LastFMDatasetConfig()

    # Populated by `_fetch_dataset_metadata` when metadata fetching is enabled.
    ITEM_METADATA = None
    USER_METADATA = None
    TAG_METADATA = None

    def _zip_path(self) -> str:
        """Return the local path of the downloaded zip archive."""
        return os.path.join(
            self.config.default_base_path, f"{self.config.remote_zipname}.zip"
        )

    def _extract_interaction_file(self, zip_path: str) -> None:
        """Extract the interaction file from the zip archive.

        Args:
            zip_path: Path to the downloaded zip archive.
        """
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extract(self.config.remote_filename, self.config.default_base_path)

    def fetch_dataset(self) -> None:
        """Check if dataset is present, if not download.

        This method overrides the base class to handle the special case where
        the zipfile may exist but the extracted file doesn't.
        """
        zip_path = self._zip_path()

        if not os.path.exists(zip_path):
            logger.debug(f"{self.name} dataset zipfile not found in {zip_path}.")
            self._download_dataset()
        elif not os.path.exists(self.file_path):
            logger.debug(
                f"{self.name} dataset file not found, but zipfile already downloaded. "
                f"Extracting file from zipfile."
            )
            self._extract_interaction_file(zip_path)
        else:
            logger.debug("Data zipfile is in memory and in dir specified.")

    def _download_dataset(self) -> None:
        """Downloads the dataset.

        Downloads the zipfile, and extracts the interaction file to `self.file_path`
        """
        zip_path = self._zip_path()

        logger.debug(f"Downloading {self.name} dataset from {self.config.dataset_url}")

        # Download the zip into the data directory
        self._fetch_remote(
            f"{self.config.dataset_url}/{self.config.remote_zipname}.zip",
            zip_path,
        )

        # Extract the interaction file which we will use
        logger.debug(f"Extracting {self.config.remote_filename} from zip")
        self._extract_interaction_file(zip_path)

    def _load_dataframe(self) -> pd.DataFrame:
        """Load the raw dataset from file, and return it as a pandas DataFrame.

        Transform the dataset downloaded to have integer user and item ids. This
        will be needed for representation in the interaction matrix.

        Returns:
            The interaction data as a DataFrame with a row per interaction.
        """
        self.fetch_dataset()
        df = pd.read_csv(
            self.file_path,
            dtype={
                self.config.item_ix: np.int32,
                self.config.user_ix: np.int32,
                self.config.tag_ix: np.int32,
                self.config.timestamp_ix: np.int64,
            },
            sep="\t",
            names=[
                self.config.user_ix,
                self.config.item_ix,
                self.config.tag_ix,
                self.config.timestamp_ix,
            ],
            header=0,
        )
        # Source timestamps are in milliseconds; store epoch seconds.
        df[self.config.timestamp_ix] = df[self.config.timestamp_ix] // 1_000

        logger.debug(f"Loaded {len(df)} interactions")
        return df

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Load user, item, and tag metadata using the processed id mappings.

        Args:
            user_id_mapping: Mapping between raw and internal user ids.
            item_id_mapping: Mapping between raw and internal item ids.
        """
        self.USER_METADATA = LastFMUserMetadata(user_id_mapping=user_id_mapping).load()
        self.ITEM_METADATA = LastFMItemMetadata(item_id_mapping=item_id_mapping).load()
        self.TAG_METADATA = LastFMTagMetadata().load()

IS_BASE = False class-attribute instance-attribute

config = LastFMDatasetConfig() class-attribute

ITEM_METADATA = None class-attribute instance-attribute

USER_METADATA = None class-attribute instance-attribute

TAG_METADATA = None class-attribute instance-attribute

name property

Name of the object's class.

:return: Name of the object's class :rtype: str

file_path property

File path of the dataset.

processed_cache_path property

Path for cached processed data.

fetch_metadata = fetch_metadata instance-attribute

preprocessor = DataFramePreprocessor(item_ix=(self.config.item_ix), user_ix=(self.config.user_ix), timestamp_ix=(self.config.timestamp_ix)) instance-attribute

timestamp_min property

Minimum timestamp in the dataset.

Returns:

Type Description
int

Minimum timestamp in the dataset.

timestamp_max property

Maximum timestamp in the dataset.

Returns:

Type Description
int

Maximum timestamp in the dataset.

fetch_dataset()

Check if dataset is present, if not download.

This method overrides the base class to handle the special case where the zipfile may exist but the extracted file doesn't.

Source code in src/recnexteval/datasets/datasets/lastfm.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def fetch_dataset(self) -> None:
    """Ensure the dataset file is available locally, downloading it if needed.

    Overrides the base implementation to also cover the case where the zip
    archive is already on disk but the interaction file was never extracted.
    """
    archive = os.path.join(
        self.config.default_base_path, f"{self.config.remote_zipname}.zip"
    )

    if not os.path.exists(archive):
        # No archive at all: download (download also performs extraction).
        logger.debug(f"{self.name} dataset zipfile not found in {archive}.")
        self._download_dataset()
        return

    if os.path.exists(self.file_path):
        # Both archive and extracted file are present; nothing to do.
        logger.debug("Data zipfile is in memory and in dir specified.")
        return

    # Archive exists but the interaction file is missing: extract only.
    logger.debug(
        f"{self.name} dataset file not found, but zipfile already downloaded. "
        f"Extracting file from zipfile."
    )
    with zipfile.ZipFile(archive, "r") as zf:
        zf.extract(self.config.remote_filename, self.config.default_base_path)

fetch_dataset_force()

Force re-download of the dataset.

Source code in src/recnexteval/datasets/base.py
47
48
49
50
def fetch_dataset_force(self) -> None:
    """Unconditionally re-download the dataset, ignoring any local copy."""
    logger.debug(f"{self.name} force re-download of dataset.")
    self._download_dataset()

get_timestamp_range_in_epoch()

Get the minimum and maximum timestamps in the dataset.

Returns:

Type Description
tuple[int, int]

A tuple of (min_timestamp, max_timestamp).

Raises:

Type Description
RuntimeError

If load() has not been called yet.

Source code in src/recnexteval/datasets/datasets/base.py
129
130
131
132
133
134
135
136
137
138
def get_timestamp_range_in_epoch(self) -> tuple[int, int]:
    """Return the dataset's minimum and maximum timestamps as epoch seconds.

    Returns:
        A tuple of (min_timestamp, max_timestamp).

    Raises:
        RuntimeError: If load() has not been called yet.
    """
    lower_bound = self.timestamp_min
    upper_bound = self.timestamp_max
    return lower_bound, upper_bound

get_timestamp_range_in_datetime()

Get the minimum and maximum timestamps in the dataset.

Returns:

Type Description
tuple[datetime, datetime]

A tuple of (min_timestamp, max_timestamp).

Raises:

Type Description
RuntimeError

If load() has not been called yet.

Source code in src/recnexteval/datasets/datasets/base.py
140
141
142
143
144
145
146
147
148
149
150
151
def get_timestamp_range_in_datetime(self) -> tuple[datetime, datetime]:
    """Return the dataset's minimum and maximum timestamps as datetimes.

    Note: uses ``datetime.fromtimestamp``, so the results are naive datetimes
    in the local timezone.

    Returns:
        A tuple of (min_timestamp, max_timestamp).

    Raises:
        RuntimeError: If load() has not been called yet.
    """
    bounds = (self.timestamp_min, self.timestamp_max)
    earliest, latest = (datetime.fromtimestamp(ts) for ts in bounds)
    return earliest, latest

add_filter(filter_)

Add a filter to be applied when loading the data.

Utilize :class:DataFramePreprocessor class to add filters to the dataset to load. The filter will be applied when the data is loaded into an :class:InteractionMatrix object when :meth:load is called.

:param filter_: Filter to be applied to the loaded DataFrame processing to interaction matrix. :type filter_: Filter

Source code in src/recnexteval/datasets/datasets/base.py
153
154
155
156
157
158
159
160
161
162
163
164
def add_filter(self, filter_: Filter) -> None:
    """Register a filter to be applied when the data is loaded.

    The filter is handed off to the :class:`DataFramePreprocessor`, which
    applies it while parsing the loaded DataFrame into an
    :class:`InteractionMatrix` during :meth:`load`.

    :param filter_: Filter to be applied to the loaded DataFrame
                processing to interaction matrix.
    :type filter_: Filter
    """
    self.preprocessor.add_filter(filter_)

load(apply_filters=True, use_cache=True)

Loads data into an InteractionMatrix object.

Data is loaded into a DataFrame using the :func:_load_dataframe function. Resulting DataFrame is parsed into an :class:InteractionMatrix object. If :data:apply_filters is set to True, the filters set will be applied to the dataset and mapping of user and item ids will be done. This is advised even if there is no filter set, as it will ensure that the user and item ids are incrementing in the order of time.

Parameters:

Name Type Description Default
apply_filters bool

To apply the filters set and preprocessing, defaults to True

True
use_cache bool

Whether to use cached processed data, defaults to True

True

Returns:

Type Description
InteractionMatrix

Resulting interaction matrix.

Source code in src/recnexteval/datasets/datasets/base.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
def load(self, apply_filters: bool = True, use_cache: bool = True) -> InteractionMatrix:
    """Loads data into an InteractionMatrix object.

    Data is loaded into a DataFrame using the :func:`_load_dataframe` function.
    Resulting DataFrame is parsed into an :class:`InteractionMatrix` object. If
    :data:`apply_filters` is set to True, the filters set will be applied to the
    dataset and mapping of user and item ids will be done. This is advised
    even if there is no filter set, as it will ensure that the user and item
    ids are incrementing in the order of time.

    Args:
        apply_filters: To apply the filters set and preprocessing,
            defaults to True
        use_cache: Whether to use cached processed data, defaults to True

    Returns:
        Resulting interaction matrix.
    """
    logger.info(f"{self.name} is loading dataset...")
    start = time.time()
    try:
        df = self._load_dataframe_from_cache() if use_cache else self._load_dataframe()
    except FileNotFoundError:
        # Cache file missing: load the raw dataframe instead and write the
        # processed cache so subsequent loads can use it.
        logger.warning("Processed cache not found, loading raw dataframe.")
        df = self._load_dataframe()
        self._cache_processed_dataframe(df)
    if apply_filters:
        logger.debug(f"{self.name} applying filters set.")
        im = self.preprocessor.process(df)
    else:
        # No preprocessing: convert the dataframe directly, keeping raw ids.
        im = self._dataframe_to_matrix(df)
        logger.warning(
            "No filters applied, user and item ids may not be incrementing in the order of time. "
            "Classes that use this dataset may not work as expected."
        )
    # Record timestamp bounds from the loaded frame so the timestamp_min /
    # timestamp_max properties can answer without reloading.
    self._timestamp_min = int(df[self.config.timestamp_ix].min())
    self._timestamp_max = int(df[self.config.timestamp_ix].max())

    if self.fetch_metadata:
        # Metadata loaders need the id mappings built by the preprocessor.
        user_id_mapping, item_id_mapping = (
            self.preprocessor.user_id_mapping,
            self.preprocessor.item_id_mapping,
        )
        self._fetch_dataset_metadata(user_id_mapping=user_id_mapping, item_id_mapping=item_id_mapping)

    end = time.time()
    logger.info(f"{self.name} dataset loaded - Took {end - start:.3}s")
    return im