tokenizer.py
import itertools
import numpy as np


class Tokenizer:
    def __init__(self, num_tokens):
        self.num_tokens = num_tokens
        self.token2id = {}
        self.token_hist = {}
        self.id2token = []
        self.vocab = []

    def fit(self, text_data):
        # Count word frequencies across the corpus.
        token_hist = {}
        for text in text_data:
            for word in text.split():
                token_hist[word] = token_hist.get(word, 0) + 1
        # Keep the most frequent words, reserving three ids for the
        # special tokens "unk", "__start__", and "__end__".
        token_hist_sorted = sorted(token_hist.items(), key=lambda x: x[1], reverse=True)
        self.token_hist = dict(itertools.islice(token_hist_sorted, self.num_tokens - 3))
        self.vocab = list(self.token_hist)
        self.id2token = ["unk", "__start__", "__end__"] + self.vocab
        # Ids must be ints so they can be stored in the integer
        # vectors produced by transform().
        self.token2id = {token: i for i, token in enumerate(self.id2token)}

    def transform(self, text_data, text_len, padding=False, post=True, append_indicators=False):
        # Note: padding/post are accepted but currently unused; every
        # vector is implicitly zero-padded to text_len.
        vec_data = []
        for text in text_data:
            words = text.split()[:text_len]
            vec = np.zeros(text_len, dtype=int)
            count = 0
            for j in range(len(words)):
                # Leave room for the start/end indicators.
                if j < text_len - 2:
                    # Out-of-vocabulary words map to id 0 ("unk").
                    vec[j] = self.token2id.get(words[j], 0)
                    count += 1
            if append_indicators:
                # Prepend "__start__" (id 1) and insert "__end__" (id 2)
                # directly after the last real token.
                vec = np.insert(vec, 0, 1, axis=0)
                vec = np.insert(vec, count + 1, 2, axis=0)
            vec_data.append(vec)
        return np.array(vec_data)
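

# A minimal usage sketch. The corpus and parameter values below are
# illustrative assumptions, not part of the repo.
if __name__ == "__main__":
    corpus = ["the cat sat on the mat", "the dog sat"]  # hypothetical data

    tok = Tokenizer(num_tokens=10)
    tok.fit(corpus)
    # id2token starts with the three reserved tokens, then words by frequency:
    # ["unk", "__start__", "__end__", "the", "sat", "cat", "on", "mat", "dog"]

    vecs = tok.transform(corpus, text_len=8, append_indicators=True)
    print(vecs.shape)  # (2, 10): text_len plus 2 slots for the indicators
    print(vecs[0])     # begins with 1 (__start__); 2 (__end__) follows the last real token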