# pretrain_embedding.py

import pkuseg
import re
import string
from zhon.hanzi import punctuation
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import multiprocessing
from gensim.models import word2vec
from gensim.models.word2vec import PathLineSentences
import logging
import os
import sys
from tqdm import tqdm


# Raw input corpus; passed as `in_path` in the (commented-out) clean_txt call
# in the __main__ section.
raw_file = './data/raw/text.txt'
# Punctuation-stripped text; passed as `out_path` in that same clean_txt call.
clean_file = './data/raw/clean_text.txt'
# Segmented text; passed as `seg_path` to clean_txt and read by
# PathLineSentences when training word2vec.
seg_file = './data/raw/seg_text.txt'


def stat(seq_length, type):
    """Print summary statistics of sequence lengths and save diagnostic plots.

    Prints ``scipy.stats.describe`` of the lengths, then draws a scatter plot
    (index vs. length) and a 10-bin histogram side by side and saves them to
    ``<type>_len_stats.jpg`` in the current working directory.

    Args:
        seq_length: Sequence of integer lengths, one per text row.
        type: Label used as the output file name prefix.  The name shadows
            the builtin but is kept for compatibility with existing callers.

    Returns:
        None.  For empty input nothing is plotted or written.
    """
    print('Seq len info :')
    seq_len = np.asarray(seq_length)
    if seq_len.size == 0:
        # stats.describe raises on empty input, so there is nothing to
        # summarise or plot; return instead of crashing the caller.
        print('no sequences to describe')
        return

    print(stats.describe(seq_len))
    idx = np.arange(len(seq_len), dtype=np.int32)

    fig = plt.figure(figsize=(16, 9))
    try:
        plt.subplot(121)
        plt.plot(idx, seq_len, 'ro')
        plt.grid(True)
        plt.xlabel('index')
        plt.ylabel('seq_len')
        plt.title('Scatter Plot')

        plt.subplot(122)
        plt.hist(seq_len, bins=10, label=['seq_len'])
        plt.grid(True)
        plt.xlabel('seq_len')
        plt.ylabel('freq')
        plt.title('Histogram')
        plt.savefig(type + '_len_stats.jpg', format='jpg')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this each call would leak one open figure.
        plt.close(fig)


def clean_func(line):
    """Strip Chinese and ASCII punctuation from one line of text.

    Args:
        line: A single line of text (str).

    Returns:
        ``line`` with every run of characters found in
        ``zhon.hanzi.punctuation`` or ``string.punctuation`` removed; all
        other characters are kept.
    """
    # The zhon punctuation string is interpolated as-is, exactly as before.
    without_hanzi = re.sub(r"[%s]+" % punctuation, "", line)
    # string.punctuation contains `\`, `]`, `^` and `-`, which are special
    # inside a character class.  Unescaped, the `\` was consumed as an escape
    # for the following `]`, so backslashes in the text were never removed.
    # re.escape makes every character a literal member of the class.
    ascii_class = r"[%s]+" % re.escape(string.punctuation)
    return re.sub(ascii_class, "", without_hanzi)


def _get_cleaning_logger():
    """Return the 'Cleaning' logger, attaching its console handler only once."""
    logger = logging.getLogger('Cleaning')
    logger.setLevel(logging.INFO)
    # Loggers are process-wide singletons: adding a handler on every call
    # would emit each message once per previous clean_txt() invocation.
    if not logger.handlers:
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)
    return logger


def clean_txt(in_path, seg_path, out_path, is_clean=True, is_seg=True):
    """Clean a raw text file and/or segment the cleaned text into words.

    Args:
        in_path: UTF-8 raw text file, one document per line.  Only read when
            ``is_clean`` is true.
        seg_path: File that receives the segmented text.  It is first written
            by ``pkuseg.test`` and then overwritten with the rows that have
            more than two tokens, tokens joined by single spaces.
        out_path: File holding the punctuation-stripped text.  Written when
            ``is_clean`` is true and used as the segmenter input when
            ``is_seg`` is true, so it must already exist if only ``is_seg``
            is requested.
        is_clean: Run the punctuation-removal step.
        is_seg: Run the segmentation, statistics and filtering step.

    Returns:
        None.  Besides the text files, ``stat`` saves
        ``segmented_len_stats.jpg`` and ``filtered_len_stats.jpg``.
    """
    logger = _get_cleaning_logger()

    if is_clean:
        logger.info('Loading raw text')
        with open(in_path, 'r', encoding='utf8') as fin:
            raw_lines = fin.readlines()

        logger.info('Removing punctuations')
        # The context manager tears the pool down even if a worker raises;
        # map() keeps the input order and batches lines per task.
        with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
            clean_lines = pool.map(clean_func, raw_lines)

        # Lines keep their own newline characters, so they are written as-is.
        with open(out_path, 'w', encoding='utf8') as fout:
            fout.writelines(clean_lines)

    if is_seg:
        logger.info('Segmenting clean text')
        pkuseg.test(out_path, seg_path, nthread=multiprocessing.cpu_count())

        logger.info('Loading segmented text')
        with open(seg_path, 'r', encoding='utf8') as fin:
            seg_lines = [line.split() for line in fin]

        logger.info('Rows of segmented text - {}'.format(len(seg_lines)))
        stat([len(tokens) for tokens in seg_lines], 'segmented')

        # Drop rows with two tokens or fewer.
        kept_lines = [tokens for tokens in seg_lines if len(tokens) > 2]
        logger.info('Rows of filtered text - {}'.format(len(kept_lines)))
        stat([len(tokens) for tokens in kept_lines], 'filtered')

        with open(seg_path, 'w', encoding='utf8') as fout:
            fout.writelines(' '.join(tokens) + '\n' for tokens in kept_lines)


if __name__ == '__main__':
    # Script entry point: train skip-gram word2vec vectors on the segmented
    # corpus and export them in the textual word2vec format.
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))
    # Uncomment to rebuild clean_file and seg_file from raw_file before training.
    # clean_txt(raw_file, seg_file, clean_file)

    output_path = './data/processed/word2vec.txt'
    # Create the output directory before training so that a missing folder
    # does not make the save fail after the whole training run.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # NOTE(review): `size` is the gensim<4 keyword; gensim>=4 renamed it to
    # `vector_size` - confirm against the installed version.
    model = word2vec.Word2Vec(PathLineSentences(seg_file), sg=1, size=300, window=5, min_count=10, sample=1e-4,
                              workers=multiprocessing.cpu_count())
    model.wv.save_word2vec_format(output_path, binary=False)