Source code for pyabsa.utils.text_utils.word2vec

# -*- coding: utf-8 -*-
# file:
# time: 02/11/2022 15:41
# author: YANG, HENG <> (杨恒)
# github:
# GScholar:
# ResearchGate:
# Copyright (C) 2022. All Rights Reserved.

import os
import time

from findfile import find_cwd_files
from transformers import AutoTokenizer

from pyabsa.utils.pyabsa_utils import fprint

# from gensim.models.word2vec import LineSentence

import os
import time
from typing import List
from transformers import AutoTokenizer

def train_word2vec(
    corpus_files: List[str] = None,
    save_path: str = "word2vec",
    vector_dim: int = 300,
    window: int = 5,
    min_count: int = 1000,
    skip_gram: int = 1,
    num_workers: int = None,
    epochs: int = 10,
    pre_tokenizer: str = None,
    **kwargs
):
    """
    Train a gensim Word2Vec model on a line-based text corpus and save its vectors.

    Every line of every corpus file is treated as one sentence. A line is
    tokenized with ``pre_tokenizer.tokenize`` when a tokenizer is given,
    otherwise it is split on whitespace. After training, the word vectors are
    written in the textual word2vec format to ``<save_path>/word2vec768d.txt``.
    Only the vectors are saved (not the full model), and the file name is fixed:
    it does not change with ``vector_dim``.

    Args:
        - corpus_files: a list (or tuple) of file paths, or a single path given
          as a string. If empty or None, ``find_cwd_files`` is used to collect
          the ``.txt`` files, excluding the keys "word2vec" and "ignore".
        - save_path: the directory where the vectors are saved; created if missing
        - vector_dim: the dimension of the resulting word vectors
        - window: the size of the window used for context
        - min_count: the minimum count of a word for it to be included in the model
        - skip_gram: whether to use skip-gram (1) or CBOW (0) algorithm
        - num_workers: the number of worker threads to use
          (default: CPU count - 1, but never fewer than 1)
        - epochs: the number of iterations over the corpus
        - pre_tokenizer: optional; either a name/path that is loaded through
          ``AutoTokenizer.from_pretrained`` or an already constructed object
          exposing a ``tokenize(text)`` method
        - **kwargs: forwarded unchanged to the ``Word2Vec`` constructor

    Raises:
        TypeError: if ``corpus_files`` is neither empty, a string, a list nor a tuple.
    """
    # Validate and normalise the corpus argument first so that invalid input
    # fails fast, before any directory is created or heavy import is done.
    if not corpus_files:
        # if corpus_files not specified, find all .txt files in the current working directory
        corpus_files = find_cwd_files(".txt", exclude_key=["word2vec", "ignore"])
    elif isinstance(corpus_files, str):
        # if only one file path is specified, convert it to a list
        corpus_files = [corpus_files]
    elif isinstance(corpus_files, (list, tuple)):
        corpus_files = list(corpus_files)
    else:
        # Explicit exception instead of `assert`, which is removed under `python -O`.
        raise TypeError(
            "corpus_files must be a file path or a list of file paths, got "
            + type(corpus_files).__name__
        )

    # Imported lazily so that gensim is only required when training is requested.
    from gensim.models import Word2Vec

    os.makedirs(save_path, exist_ok=True)

    # load the input corpus
    fprint("Start loading corpus files:", corpus_files)
    if isinstance(pre_tokenizer, str):
        pre_tokenizer = AutoTokenizer.from_pretrained(pre_tokenizer)

    in_corpus = []
    for corpus_file in corpus_files:
        with open(corpus_file, "r", encoding="utf-8") as fin:
            for line in fin:
                text = line.strip()
                if pre_tokenizer:
                    tokens = pre_tokenizer.tokenize(text)
                else:
                    tokens = text.split()
                in_corpus.append(tokens)

    if num_workers:
        workers = num_workers
    else:
        # os.cpu_count() may return None, and "count - 1" is 0 on a single-core
        # host; always keep at least one worker thread.
        workers = max(1, (os.cpu_count() or 1) - 1)

    # train the Word2Vec model
    fprint("Start training word2vec model")
    start = time.time()
    model = Word2Vec(
        sentences=in_corpus,
        vector_size=vector_dim,
        window=window,
        min_count=min_count,
        sg=skip_gram,
        workers=workers,
        epochs=epochs,
        **kwargs
    )
    fprint("Time cost: ", time.time() - start)

    # binary=False: store the vectors as plain text rather than in the binary
    # (C-tool compatible) word2vec format.
    # NOTE: the output name is fixed to "word2vec768d.txt" whatever vector_dim is;
    # kept unchanged so existing consumers of that path keep working.
    model.wv.save_word2vec_format(
        os.path.join(save_path, "word2vec768d.txt"), binary=False
    )
    fprint("Word2vec training done ")
if __name__ == "__main__":
    # Script entry point: train word vectors with a pre-built BPE tokenizer.

    # Sets TOKENIZERS_PARALLELISM to "false" before the tokenizer is loaded
    # (presumably to silence/avoid Hugging Face tokenizers parallelism issues
    # when worker threads are used -- confirm).
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # "rna_bpe_tokenizer" is resolved by AutoTokenizer.from_pretrained.
    tokenizer = AutoTokenizer.from_pretrained("rna_bpe_tokenizer")

    # An empty list makes train_word2vec collect the .txt corpus files itself.
    paths = []
    train_word2vec(paths, "word2vec", num_workers=12, pre_tokenizer=tokenizer)