Source code for pyabsa.tasks.ABSAInstruction.data_utils

# -*- coding: utf-8 -*-
# file: data_utils.py
# time: 15/03/2023
# author: yangheng <hy345@exeter.ac.uk>
# github: https://github.com/yangheng95
# huggingface: https://huggingface.co/yangheng
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
# Copyright (C) 2021. All Rights Reserved.

import json

import findfile
import pandas as pd
from datasets import DatasetDict, Dataset

from .instruction import (
    ATEInstruction,
    CategoryInstruction,
    OpinionInstruction,
    APCInstruction,
)


class InstructDatasetLoader:
    """Turn labelled ABSA dataframes into instruction-style datasets.

    The loader keeps an in-domain ("id") train/test pair and an optional
    out-of-domain ("ood") train/test pair of pandas dataframes, and can
    convert them to HuggingFace ``DatasetDict`` objects.
    """

    def __init__(
        self,
        train_df_id,
        test_df_id,
        train_df_ood=None,
        test_df_ood=None,
        sample_size=1,
    ):
        """
        Store the dataframes, sub-sampling only the training frames.

        :param train_df_id: in-domain training dataframe (required).
        :param test_df_id: in-domain test dataframe, or ``None``.
        :param train_df_ood: out-of-domain training dataframe, or ``None``.
        :param test_df_ood: out-of-domain test dataframe, or ``None``.
        :param sample_size: fraction forwarded to ``DataFrame.sample(frac=...)``
            for the training frames; the fixed ``random_state`` makes the
            sampling reproducible. Test frames are stored untouched.
        """
        self.train_df_id = train_df_id.sample(frac=sample_size, random_state=1999)
        self.test_df_id = test_df_id
        if train_df_ood is not None:
            self.train_df_ood = train_df_ood.sample(frac=sample_size, random_state=1999)
        else:
            self.train_df_ood = train_df_ood
        self.test_df_ood = test_df_ood

    def prepare_instruction_dataloader(self, df):
        """
        Prepare the data in the input format required.

        Every row of ``df`` is expanded into one example per task (aspect
        extraction, polarity, opinion and, when available, category).

        :param df: dataframe whose rows provide ``"text"`` and ``"labels"``;
            each label is indexed with the keys ``"aspect"``, ``"polarity"``,
            ``"opinion"`` and ``"category"``.
        :return: dataframe built from the generated ``{"text", "labels"}``
            records, in row order.
        """
        ate_instructor = ATEInstruction()
        apc_instructor = APCInstruction()
        op_instructor = OpinionInstruction()
        cat_instructor = CategoryInstruction()
        alldata = []
        for _, data in df.iterrows():
            text = data["text"]
            labels = data["labels"]

            # dict.fromkeys de-duplicates while keeping first-seen order.
            aspects = "|".join(
                dict.fromkeys(
                    ("aspect:" + label["aspect"]).strip() for label in labels
                )
            )
            polarities = "|".join(
                dict.fromkeys(
                    "{}:{}".format(label["aspect"], label["polarity"])
                    for label in labels
                )
            )
            # Opinions and categories are intentionally NOT de-duplicated.
            opinions = "|".join(
                "{}:{}".format(label["aspect"], label["opinion"])
                for label in labels
            )
            categories = "|".join(
                "{}:{}".format(label["aspect"], label["category"])
                for label in labels
            )

            # ATE task
            alldata.append(
                {"text": ate_instructor.prepare_input(text), "labels": aspects}
            )

            # APC task
            alldata.append(
                {
                    "text": apc_instructor.prepare_input(text, aspects),
                    "labels": polarities,
                }
            )

            # Opinion task
            alldata.append(
                {
                    "text": op_instructor.prepare_input(text, aspects),
                    "labels": opinions,
                }
            )

            # Category task: skipped as soon as the substring "NULL" occurs
            # anywhere in the joined category string.
            if "NULL" not in categories:
                alldata.append(
                    {
                        "text": cat_instructor.prepare_input(text, aspects),
                        "labels": categories,
                    }
                )

        return pd.DataFrame(alldata)

    @staticmethod
    def _build_dataset_dict(train_df, test_df):
        """Return a DatasetDict with the non-None splits, or None if both are None."""
        splits = {}
        if train_df is not None:
            splits["train"] = Dataset.from_pandas(train_df)
        if test_df is not None:
            splits["test"] = Dataset.from_pandas(test_df)
        return DatasetDict(splits) if splits else None

    def create_datasets(self, tokenize_function):
        """
        Create the training and test dataset as huggingface datasets format.

        :param tokenize_function: callable passed to ``DatasetDict.map`` with
            ``batched=True``.
        :return: tuple ``(indomain_dataset, indomain_tokenized_datasets,
            other_domain_dataset, other_domain_tokenized_dataset)``; the two
            out-of-domain entries are ``None`` when no out-of-domain frame
            was supplied.
        """
        # In-domain: "train" is always present, "test" only when provided.
        indomain_dataset = self._build_dataset_dict(
            self.train_df_id, self.test_df_id
        )
        indomain_tokenized_datasets = indomain_dataset.map(
            tokenize_function, batched=True
        )

        # Out-of-domain: built strictly from the OOD frames (previously the
        # single-split cases reused the in-domain training frame by mistake).
        other_domain_dataset = self._build_dataset_dict(
            self.train_df_ood, self.test_df_ood
        )
        if other_domain_dataset is not None:
            other_domain_tokenized_dataset = other_domain_dataset.map(
                tokenize_function, batched=True
            )
        else:
            other_domain_tokenized_dataset = None

        return (
            indomain_dataset,
            indomain_tokenized_datasets,
            other_domain_dataset,
            other_domain_tokenized_dataset,
        )


def read_json(data_path, data_type="train"):
    """
    Load every record from the JSON-lines files located under ``data_path``.

    Files are looked up with ``findfile.find_files`` using the keys
    ``[data_type, ".jsonl"]`` and excluding ``".txt"``; each matching path is
    printed before it is read.

    :param data_path: location handed to ``findfile.find_files``.
    :param data_type: key used together with ``".jsonl"`` to select files
        (presumably a split name such as "train" or "test" - confirm against
        callers).
    :return: list with one decoded JSON value per non-blank line, in the
        order the files and lines were read.
    """
    data = []

    files = findfile.find_files(data_path, [data_type, ".jsonl"], exclude_key=[".txt"])
    for f in files:
        print(f)
        with open(f, "r", encoding="utf8") as fin:
            for line in fin:
                # Blank lines (e.g. a trailing newline at end of file) are not
                # JSON documents and would make json.loads raise.
                if not line.strip():
                    continue
                data.append(json.loads(line))
    return data