# Source code for pyabsa.framework.dataset_class.dataset_template

# -*- coding: utf-8 -*-
# file: dataset_template.py
# time: 02/11/2022 15:44
# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
# github: https://github.com/yangheng95
# GScholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
# ResearchGate: https://www.researchgate.net/profile/Heng-Yang-17/research
# Copyright (C) 2022. All Rights Reserved.
import torch
from torch.utils.data import Dataset


class PyABSADataset(Dataset):
    """Base PyTorch ``Dataset`` for aspect-based sentiment analysis data.

    On construction the dataset is loaded either from ``config.dataset_dict``
    (via :meth:`load_data_from_dict`) or from ``config.dataset_file`` (via
    :meth:`load_data_from_file`), converted to tensors where possible,
    optionally truncated to ``config.data_num`` samples, and stored in
    ``self.data``.

    Attributes:
        config: The configuration object passed to the constructor.
        tokenizer: The tokenizer passed to the constructor.
        dataset_type: The dataset split name passed to the constructor,
            e.g. ``"train"``, ``"dev"`` or ``"test"``.
        data: The loaded samples.
    """

    # Class-level default kept for backward compatibility. ``__init__`` gives
    # every instance its own list so that loaders which append in place never
    # mutate a list shared between instances.
    data = []

    def __init__(self, config, tokenizer, dataset_type, **kwargs):
        """Load, tensorize and optionally truncate the requested dataset split.

        :param config: Configuration object. It must provide ``get(key,
            default)`` plus attribute access to ``dataset_dict`` /
            ``dataset_file`` (when present) and ``logger``.
        :param tokenizer: Tokenizer stored on the instance and forwarded to
            the file loader.
        :param dataset_type: The split to load, e.g. ``"train"``, ``"dev"``
            or ``"test"``.
        :param kwargs: Extra arguments forwarded to the loader, such as
            ``text_column``, ``aspect_column`` and ``label_column``.
        """
        super().__init__()
        self.config = config
        self.tokenizer = tokenizer
        self.dataset_type = dataset_type

        # Shadow the shared class-level list unless a subclass has already
        # assigned instance data before delegating to this constructor.
        if "data" not in vars(self):
            self.data = list(self.data)

        if (
            self.config.get("dataset_dict")
            and dataset_type in self.config.dataset_dict
            and self.config.dataset_dict[dataset_type]
        ):
            self.load_data_from_dict(
                self.config.dataset_dict, dataset_type=dataset_type, **kwargs
            )
            self.data = self.covert_to_tensor(self.data)
        elif (
            self.config.get("dataset_file")
            and dataset_type in self.config.dataset_file
            and self.config.dataset_file[dataset_type]
        ):
            self.load_data_from_file(
                self.config.dataset_file, dataset_type=dataset_type, **kwargs
            )
            self.data = self.covert_to_tensor(self.data)

        # A missing or falsy ``data_num`` (None, 0) means "keep everything".
        self.data = self.data[: self.config.get("data_num") or None]

        if self.config.get("verbose", True):
            self.config.logger.info(
                "{} data examples:\n {}".format(dataset_type, self.data[:2])
            )

    @staticmethod
    def covert_to_tensor(data):
        """Convert the values of every sample dict in ``data`` to tensors.

        The conversion happens in place. Dict values that ``torch.tensor``
        cannot handle (for example raw text) are left unchanged. Lists and
        tuples are searched recursively for sample dicts; any other element
        is ignored.

        The misspelled name ("covert") is kept because it is part of the
        public interface.

        :param data: An iterable of samples (dicts, or nested lists/tuples of
            dicts).
        :return: The same ``data`` object, with convertible values replaced
            by tensors.
        """
        for item in data:
            if isinstance(item, dict):
                for key, value in item.items():
                    try:
                        item[key] = torch.tensor(value)
                    except Exception:
                        # Deliberate best effort: non-numeric fields such as
                        # strings stay as they are.
                        continue
            elif isinstance(item, (list, tuple)):
                # One recursion per container: recursing into each element
                # first would fail on scalar elements and convert nested
                # samples twice.
                PyABSADataset.covert_to_tensor(item)
        return data

    def load_data_from_dict(self, dataset_dict, dataset_type, **kwargs):
        """Build ``self.data`` from column-oriented data in a dictionary.

        The three columns are zipped together, so the result is as long as
        the shortest column.

        :param dataset_dict: Mapping from dataset type to a mapping of column
            name to a sequence of values.
        :param dataset_type: The split to load, e.g. ``"train"``, ``"dev"``
            or ``"test"``.
        :param kwargs: Must contain ``text_column``, ``aspect_column`` and
            ``label_column``, naming the columns to read.
        :raises KeyError: If a column keyword, the split or a column is
            missing.
        """
        columns = dataset_dict[dataset_type]
        self.data = [
            {"text": text, "aspect": aspect, "label": label}
            for text, aspect, label in zip(
                columns[kwargs["text_column"]],
                columns[kwargs["aspect_column"]],
                columns[kwargs["label_column"]],
            )
        ]

    def load_data_from_file(self, dataset_file, dataset_type, **kwargs):
        """Populate ``self.data`` by calling the loader for ``dataset_type``.

        ``dataset_file[dataset_type]`` is called with ``(config, tokenizer,
        **kwargs)`` and its return value becomes ``self.data``. If
        ``dataset_type`` is not a key of ``dataset_file`` nothing is loaded.

        :param dataset_file: Mapping from dataset type to a callable loader.
        :param dataset_type: The split to load, e.g. ``"train"``, ``"test"``
            or ``"dev"``.
        :param kwargs: Extra arguments forwarded to the loader.
        """
        if dataset_type in dataset_file:
            self.data = dataset_file[dataset_type](
                self.config, self.tokenizer, **kwargs
            )

    def __getitem__(self, index):
        """Return the sample stored at ``index``.

        :param index: Index (or slice) into ``self.data``.
        :return: The stored sample.
        """
        return self.data[index]

    def get_labels(self):
        """Return the ``"label"`` entry of every sample, in dataset order.

        :return: A list of labels.
        """
        return [sample["label"] for sample in self.data]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.data)

    def __str__(self):
        """Return a short summary containing the number of samples."""
        return f"PyABSADataset: {len(self.data)} samples"

    def __repr__(self):
        """Return the same summary as :meth:`__str__`."""
        return self.__str__()