-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathpretrain_embedding.py
118 lines (97 loc) · 3.85 KB
/
pretrain_embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import pkuseg
import re
import string
from zhon.hanzi import punctuation
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import multiprocessing
from gensim.models import word2vec
from gensim.models.word2vec import PathLineSentences
import logging
import os
import sys
from tqdm import tqdm
# Corpus locations, all relative to the current working directory.
# Raw input text; passed as ``in_path`` in the (commented-out) clean_txt call below.
raw_file = './data/raw/text.txt'
# Punctuation-free text; passed as ``out_path`` in that same call.
clean_file = './data/raw/clean_text.txt'
# Segmented text; passed as ``seg_path`` in that call and read by the Word2Vec training step.
seg_file = './data/raw/seg_text.txt'
def stat(seq_length, type):
    """Print summary statistics of sequence lengths and save a two-panel plot.

    The figure holds a scatter plot (row index vs. length) on the left and a
    10-bin histogram of the lengths on the right.

    Args:
        seq_length: Sequence of numeric lengths, one entry per row.
        type: Prefix of the output image; the figure is saved as
            ``<type>_len_stats.jpg``. The name shadows the builtin but is kept
            so existing keyword callers keep working.
    """
    print('Seq len info :')
    seq_len = np.asarray(seq_length)
    idx = np.arange(len(seq_len), dtype=np.int32)
    print(stats.describe(seq_len))

    fig = plt.figure(figsize=(16, 9))
    try:
        # Left panel: every row's length against its position in the input.
        plt.subplot(121)
        plt.plot(idx, seq_len, 'ro')
        plt.grid(True)
        plt.xlabel('index')
        plt.ylabel('seq_len')
        plt.title('Scatter Plot')

        # Right panel: distribution of the lengths.
        plt.subplot(122)
        plt.hist(seq_len, bins=10, label=['seq_len'])
        plt.grid(True)
        plt.xlabel('seq_len')
        plt.ylabel('freq')
        plt.title('Histogram')

        plt.savefig(type + '_len_stats.jpg', format='jpg')
    finally:
        # pyplot keeps a figure alive until it is closed explicitly; without
        # this every call would leave another 16x9 figure in memory.
        plt.close(fig)
def clean_func(line):
    """Return ``line`` with punctuation characters removed.

    Two passes are applied: first the characters in ``punctuation`` (imported
    from ``zhon.hanzi``), then the ASCII characters in ``string.punctuation``.

    Args:
        line: A single line of text.

    Returns:
        The line with every run of punctuation deleted; all other characters,
        including whitespace and the trailing newline, are left as they are.
    """
    without_hanzi = re.sub(r"[%s]+" % punctuation, "", line)
    # string.punctuation contains regex metacharacters (``\``, ``]``, ``^``,
    # ``-``). Unescaped, its ``\]`` is parsed as an escaped bracket, so the
    # backslash itself was never part of the class and survived cleaning.
    ascii_class = r"[%s]+" % re.escape(string.punctuation)
    return re.sub(ascii_class, "", without_hanzi)
def clean_txt(in_path, seg_path, out_path, is_clean=True, is_seg=True):
    """Strip punctuation from a raw corpus, segment it, and drop short rows.

    Args:
        in_path: UTF-8 text file with the raw corpus; read when ``is_clean``.
        seg_path: File handed to ``pkuseg.test`` as its output. It is read
            back afterwards and rewritten with only the rows that have more
            than two whitespace-separated tokens.
        out_path: File that receives the punctuation-free text and is handed
            to ``pkuseg.test`` as its input.
        is_clean: Run the punctuation-removal stage.
        is_seg: Run the segmentation, statistics and filtering stage.

    Side effects:
        Writes ``out_path`` and ``seg_path``, and (via ``stat``) the images
        ``segmented_len_stats.jpg`` and ``filtered_len_stats.jpg``.
    """
    logger = logging.getLogger('Cleaning')
    logger.setLevel(logging.INFO)
    # getLogger returns the same logger object on every call, so attach the
    # console handler only once; otherwise each call duplicates every message.
    if not logger.handlers:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        logger.addHandler(console_handler)

    if is_clean:
        logger.info('Loading raw text')
        with open(in_path, 'r', encoding='utf8') as fin:
            raw_lines = fin.readlines()

        logger.info('Removing punctuations')
        # map() keeps the input order and batches lines per worker; the
        # context manager releases the workers even if cleaning raises.
        with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
            clean_lines = pool.map(clean_func, raw_lines)

        # Lines still carry their own newline characters, so write them as-is.
        with open(out_path, 'w', encoding='utf8') as fout:
            fout.writelines(clean_lines)

    # NOTE(review): indentation was lost in the copy this was restored from;
    # the load/filter/rewrite steps are assumed to belong to the is_seg stage
    # - confirm against the original file.
    if is_seg:
        logger.info('Segmenting clean text')
        # seg = pkuseg.pkuseg()
        # seg_lines = [seg.cut(line) for line in raw_lines]
        pkuseg.test(out_path, seg_path, nthread=multiprocessing.cpu_count())

        logger.info('Loading segmented text')
        with open(seg_path, 'r', encoding='utf8') as fin:
            seg_lines = [line.strip().split() for line in fin]
        logger.info('Rows of segmented text - {}'.format(len(seg_lines)))
        stat([len(tokens) for tokens in seg_lines], 'segmented')

        # Keep only rows with more than two tokens.
        kept_lines = [tokens for tokens in seg_lines if len(tokens) > 2]
        logger.info('Rows of filtered text - {}'.format(len(kept_lines)))
        stat([len(tokens) for tokens in kept_lines], 'filtered')

        # Overwrite the segmented file with the filtered rows.
        with open(seg_path, 'w', encoding='utf8') as fout:
            fout.writelines(' '.join(tokens) + '\n' for tokens in kept_lines)

    # NOTE(review): looks like a leftover debugging marker; kept so the
    # function's console output is unchanged - confirm it can be removed.
    print('hello world')
if __name__ == '__main__':
    # Script entry point: train word vectors on the segmented corpus and
    # export them in the textual word2vec format.
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # Preprocessing is disabled here; seg_file is expected to exist already.
    # clean_txt(raw_file, seg_file, clean_file)

    # Create the output directory before training so that a missing folder
    # cannot make the run fail at the very last step.
    output_path = './data/processed/word2vec.txt'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # NOTE(review): the ``size`` keyword is the gensim < 4.0 spelling (later
    # renamed ``vector_size``) - confirm the pinned gensim version.
    model = word2vec.Word2Vec(PathLineSentences(seg_file), sg=1, size=300, window=5, min_count=10, sample=1e-4,
                              workers=multiprocessing.cpu_count())
    model.wv.save_word2vec_format(output_path, binary=False)