-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutil_trans.py
154 lines (135 loc) · 6.15 KB
/
util_trans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import re
import execjs
import urllib.request
import urllib.parse
import urllib.error
import json
import time
class TkGenerator:
    """
    Produce the "TK" value that accompanies a string sent for translation.

    The TK is a string derived from the text by a JavaScript routine, playing
    a role similar to a hash of that text: it has to be posted together with
    the text it was computed from.
    """

    # JavaScript source handed to execjs. It defines TL(a), which builds the
    # token for the string a, and the helper RL(a, b) that TL relies on.
    _JS_SOURCE = """
function TL(a) {
var k = "";
var b = 406644;
var b1 = 3293161072;
var jd = ".";
var $b = "+-a^+6";
var Zb = "+-3^+b+-f";
for (var e = [], f = 0, g = 0; g < a.length; g++) {
var m = a.charCodeAt(g);
128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
e[f++] = m >> 18 | 240,
e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
e[f++] = m >> 6 & 63 | 128),
e[f++] = m & 63 | 128)
}
a = b;
for (f = 0; f < e.length; f++) a += e[f],
a = RL(a, $b);
a = RL(a, Zb);
a ^= b1 || 0;
0 > a && (a = (a & 2147483647) + 2147483648);
a %= 1E6;
return a.toString() + jd + (a ^ b)
};
function RL(a, b) {
var t = "a";
var Yb = "+";
for (var c = 0; c < b.length - 2; c += 3) {
var d = b.charAt(c + 2),
d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
}
return a
}
"""

    def __init__(self):
        # Compile the script once; every get_tk call reuses this context.
        self.ctx = execjs.compile(self._JS_SOURCE)

    def get_tk(self, text: str) -> str:
        """
        Compute the TK for the given text.
        :param text: the string that is going to be translated
        :return: whatever the JavaScript function TL returns for that string
        """
        token = self.ctx.call("TL", text)
        return token
class Translator:
    """
    Small client for the translate.google.cn "translate_a/single" endpoint.

    Each request carries the text as the form field "q" plus a TK token
    computed from that text by TkGenerator.
    """

    def __init__(self):
        # Browser-like User-Agent header sent with every request.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        self.tk_gen = TkGenerator()
        # Not used by any method of this class; kept because it is a public attribute.
        self.pattern = re.compile(r'\["(.*?)(?:\\n)')
        # Maximum number of characters translate_lines packs into one request.
        self.max_limited = 3500

    def __post(self, url, text):
        """
        POST the text as form field "q" and return the decoded response body.
        :param url: full request URL, including the query string
        :param text: text to send
        :return: response body decoded as UTF-8
        """
        data = urllib.parse.urlencode({'q': text}).encode(encoding='utf-8')
        request = urllib.request.Request(url=url, data=data, headers=self.headers)
        # The context manager closes the connection even when read()/decode() raises.
        with urllib.request.urlopen(request) as response:
            return response.read().decode('utf-8')

    def __translate(self, text, src_lang, target_lang) -> str:
        """
        Build the request URL for the text and return the raw response string.
        :param text: Origin text
        :param src_lang: source language code
        :param target_lang: target language code
        :return: raw respond string
        """
        tk = self.tk_gen.get_tk(text)
        # Percent-encode the language codes so characters such as "&" or "="
        # cannot alter the query string; ordinary codes ("en", "zh-CN") are unchanged.
        sl = urllib.parse.quote(str(src_lang), safe='')
        tl = urllib.parse.quote(str(target_lang), safe='')
        url = "http://translate.google.cn/translate_a/single?client=t" \
              "&sl=%s&tl=%s&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca" \
              "&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&clearbtn=1&otf=1&pc=1" \
              "&srcrom=0&ssel=0&tsel=0&kc=1&tk=%s" % (sl, tl, tk)
        return self.__post(url, text)

    def translate_raw(self, text: str, src_lang: str, target_lang: str) -> str:
        """
        Similar to the method "translate", but this returns the whole response.
        :param text: Origin text
        :param src_lang: source language. the ISO-639-1 language code of the input text
        :param target_lang: target language. the ISO-639-1 language code of the output text
        :return: raw respond string
        """
        return self.__translate(text, src_lang, target_lang)

    def translate(self, text: str, src_lang: str, target_lang: str) -> str:
        """
        Execute translate.
        Afrikaans af Albanian sq Amharic am Arabic ar Armenian hy Azerbaijani az
        Basque eu Belarusian be Bengali bn Bosnian bs Bulgarian bg Catalan ca
        Cebuano ceb Chinese(Simplified) zh-CN Chinese (Traditional) zh-TW
        Corsican co Croatian hr Czech cs Danish da Dutch nl English en
        Esperanto eo Estonian et Finnish fi French fr Frisian fy Galician gl
        Georgian ka German de Greek el Gujarati gu Haitian Creole ht Hausa ha
        Hawaiian haw Hebrew he Hindi hi Hmong hmn Hungarian hu Icelandic is
        Igbo ig Indonesian id Irish ga Italian it Japanese ja Javanese jw
        ...
        Explore more google translate supported language please visit: https://cloud.google.com/translate/docs/languages
        :param text: Origin text. An empty string returns '' without sending a request.
        :param src_lang: source language. the ISO-639-1 language code of the input text
        :param target_lang: target language. the ISO-639-1 language code of the output text
        :return: translated text
        """
        if not text:
            return ''
        result = self.__translate(text, src_lang, target_lang)
        # The first element of the decoded response is the list of segments;
        # treat a null there as "no segments" instead of failing on iteration.
        segments = json.loads(result)[0] or []
        # NOTE(review): entries whose first field is null are presumably the
        # romanization record requested via dt=rm - confirm against a live
        # response. Skipping them (instead of always dropping the last entry)
        # keeps every translated sentence when no such record is present.
        return ''.join(segment[0] for segment in segments if segment and segment[0])

    def translate_lines(self, text_list: list, src_lang: str, target_lang: str, interval: float = 1) -> str:
        """
        Translate a list of lines, batching them into as few requests as possible.

        Consecutive lines are joined with a newline into chunks whose length
        stays within self.max_limited. A single line longer than that limit is
        sent as a chunk of its own; it is not split.
        :param text_list: lines to translate, in order
        :param src_lang: source language code
        :param target_lang: target language code
        :param interval: seconds to wait between two consecutive requests
        :return: the translated chunks joined with a newline; '' for an empty list
        """
        chunks = []
        current = []
        current_length = 0
        for line in text_list:
            # +1 accounts for the newline that will join this line to the chunk.
            if current and current_length + 1 + len(line) > self.max_limited:
                chunks.append('\n'.join(current))
                current = []
                current_length = 0
            current_length += len(line) + (1 if current else 0)
            current.append(line)
        if current:
            chunks.append('\n'.join(current))

        translated = []
        for index, chunk in enumerate(chunks):
            # Pause only between requests, never before the first one.
            if index and interval > 0:
                time.sleep(interval)
            translated.append(self.translate(chunk, src_lang, target_lang))
        return '\n'.join(translated)
def main():
    """
    Demo entry point: translate a sample English paragraph into Simplified
    Chinese and Japanese and print both results.
    """
    translator = Translator()
    # Adjacent literals are concatenated at compile time, so indentation of
    # the continuation lines can never leak into the text.
    raw_text = (
        "The Translation API's recognition engine supports a wide variety of languages for the Phrase-Based "
        "Machine Translation (PBMT) and Neural Machine Translation (NMT) models. \nThese languages are specified within a "
        "recognition request using language code parameters as noted on this page. \nMost language code parameters conform "
        "to ISO-639-1 identifiers, except where noted."
    )
    # 'zh-CN' is the spelling listed in Translator.translate's language table.
    for target_lang in ('zh-CN', 'ja'):
        print(translator.translate(raw_text, src_lang='en', target_lang=target_lang))


if __name__ == '__main__':
    main()