Source code for pyabsa.augmentation.text_augment.tc_augment

# -*- coding: utf-8 -*-
# file: tc_augment.py
# time: 02/11/2022 19:51
# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
# github: https://github.com/yangheng95
# GScholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
# ResearchGate: https://www.researchgate.net/profile/Heng-Yang-17/research
# Copyright (C) 2022. All Rights Reserved.
import os
import sys

from termcolor import colored

from pyabsa.utils.pyabsa_utils import fprint


[docs] class AugmentBackend:
[docs] EDA = "EDA"
[docs] ContextualWordEmbsAug = "ContextualWordEmbsAug"
[docs] RandomWordAug = "RandomWordAug"
[docs] AntonymAug = "AntonymAug"
[docs] SynonymAug = "SynonymAug"
[docs] SplitAug = "SplitAug"
[docs] BackTranslationAug = "BackTranslationAug"
[docs] SpellingAug = "SpellingAug"
[docs] def auto_classification_augmentation( config, dataset, device: str, boosting_fold: int = 4, classifier_training_num: int = 1, augment_num_per_case: int = 10, winner_num_per_case: int = 5, augment_backend: str = "eda", train_after_aug: bool = True, rewrite_cache: bool = True, ) -> None: """ Augment the dataset using BoostTextAugmentation tool (https://github.com/yangheng95/BoostTextAugmentation) for text classification. Args: config (ABSAConfig): The configuration object for ABSA. dataset (ABSADataset): The dataset to be augmented. device (str): The device to run the augment on. boosting_fold (int, optional): The number of boosting fold. Defaults to 4. classifier_training_num (int, optional): The number of classifier training. Defaults to 1. augment_num_per_case (int, optional): The number of augmented samples to generate per case. Defaults to 10. winner_num_per_case (int, optional): The number of winners per case. Defaults to 5. augment_backend (str, optional): The data augment backend to use. Defaults to "eda". train_after_aug (bool, optional): Whether to train the model after the data augmentation. Defaults to True. rewrite_cache (bool, optional): Whether to rewrite the cache files. Defaults to True. Returns: None """ fprint( colored( "Performing augmentation for text classification. This may take a long time", "yellow", ) ) from pyabsa.tasks.TextClassification import BERTTCModelList from boost_aug import TCBoostAug, AugmentBackend config.model = BERTTCModelList.BERT_MLP augmentor = TCBoostAug( ROOT=os.getcwd(), BOOSTING_FOLD=boosting_fold, CLASSIFIER_TRAINING_NUM=classifier_training_num, AUGMENT_NUM_PER_CASE=augment_num_per_case, WINNER_NUM_PER_CASE=winner_num_per_case, AUGMENT_BACKEND=augment_backend, device=device, ) augmentor.tc_boost_augment( config=config, dataset=dataset, train_after_aug=train_after_aug, rewrite_cache=rewrite_cache, )