-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathcrawler.py
67 lines (60 loc) · 2.19 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# -*- coding: utf-8 -*-
"""
@version: 0.1
@author: linus
@Email: linshaofeng1992@gmail.com
@file: crawler.py
@time: 2019/3/18 16:04
"""
import requests
from bs4 import BeautifulSoup
import re
# Shared HTTP session: reuses the connection and sends browser-like headers
# with every request made by this script.
sess = requests.Session()
# Update the session's default headers in place rather than replacing them,
# so the session keeps its own (case-insensitive) header mapping.
# Accept-Encoding is deliberately not overridden: requests advertises only
# the encodings it is able to decode, whereas hard-coding "br" can produce
# undecodable response bodies when no Brotli package is installed.
sess.headers.update({
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36'
                  ' (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
    'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    'Accept-Language': "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    'Connection': "keep-alive",
})
# Fetch the word-book index page and collect the absolute URL of every
# word-list link found on it.
base_url = 'https://www.shanbay.com'
resp = sess.get('https://www.shanbay.com/wordbook/104791/')
# Fail loudly on an HTTP error instead of parsing an error page, which would
# silently produce an empty URL list (and, later, an empty output file).
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'lxml')
# Word lists are the <a> tags whose text matches the word-book title pattern;
# their href values are site-relative, hence the base_url prefix.
url_list = [
    base_url + link['href']
    for link in soup.find_all('a', text=re.compile("程序员必学电脑计算机专业英语词汇 "))
]
# Page size used by the pagination arithmetic below (20 entries per page).
# NOTE(review): value taken from the original computation; confirm against
# the site if the layout changes.
WORDS_PER_PAGE = 20


def _fetch_soup(page_url):
    """Download *page_url* with the shared session and parse it with lxml.

    Raises requests.HTTPError when the server answers with an error status,
    so an error page is never parsed as if it were a word list.
    """
    page_resp = sess.get(page_url)
    page_resp.raise_for_status()
    return BeautifulSoup(page_resp.text, 'lxml')


def _extract_rows(page_soup):
    """Return one list of stripped, non-empty <td> texts per table row.

    Rows without <td> cells yield an empty list; such rows are kept here
    and are expected to be filtered out by the consumer of ``data``.
    A page that lacks the word table contributes no rows.
    """
    table = page_soup.find('table', {'class': 'table table-bordered table-striped'})
    if table is None:
        return []
    rows = []
    for row in table.find_all('tr'):
        cells = [cell.text.strip() for cell in row.find_all('td')]
        rows.append([cell for cell in cells if cell])
    return rows


# Crawl every page of every word list and accumulate the raw table rows.
data = list()
for url in url_list:
    soup = _fetch_soup(url)
    print(url)
    data.extend(_extract_rows(soup))
    # The first page displays the total number of words in the list, from
    # which the page count follows.
    word_num = int(soup.select('#wordlist-num-vocab')[0].text)
    # Ceiling division: int(word_num / 20 + 1) would request one page too
    # many whenever word_num is an exact multiple of the page size.
    page_num = (word_num + WORDS_PER_PAGE - 1) // WORDS_PER_PAGE
    # Page 1 was handled above; fetch the remaining pages.
    for page in range(2, page_num + 1):
        data.extend(_extract_rows(_fetch_soup(url + '?page=' + str(page))))
# Build the word -> definition mapping from the crawled rows.
# Only rows with exactly two cells are kept. Using a dict de-duplicates
# repeated words: the last definition seen wins, while the word keeps the
# position of its first occurrence. Newlines inside a definition are
# flattened to spaces so each entry stays on one line in the output.
dict_ = dict()
for row in data:
    if len(row) == 2:
        word, meaning = row
        dict_[word] = meaning.replace('\n', ' ')

# Write the numbered glossary, one entry per paragraph, numbering from 0.
# The encoding is explicit because the content is Chinese text and the
# platform default encoding is not guaranteed to be able to represent it.
with open('计算机词汇.txt', 'w', encoding='utf-8') as f:
    for tag, (word, meaning) in enumerate(dict_.items()):
        f.write(str(tag) + '. ' + word + ': ' + meaning + '\n\n')