# dice_test.py -- Dice.com job scraper.
# (Web-viewer chrome and the line-number gutter left over from the page
# extraction were removed here; the actual source begins below.)
import requests,re,math,MySQLdb
from bs4 import BeautifulSoup
from config import *
from indeed import *
################################ START CREATE DATABASE #############################
# Create the target database once (idempotent) via a server-level connection.
# NOTE(review): the database/table names come from config, not user input;
# identifiers cannot be parameterized, so concatenation is acceptable here.
connection = MySQLdb.connect(mysql['host'], mysql['user'], mysql['password'])
cursor = connection.cursor()
sql = 'CREATE DATABASE IF NOT EXISTS `'+ mysql['dataBaseName'] +'` CHARACTER SET utf8 COLLATE utf8_general_ci;'
cursor.execute(sql)
cursor.close()
################################ END CREATE DATABASE ###############################
################################ START connection to Mysql #########################
# Reconnect bound to the database just created. These two names are
# module-level globals used by insert_Into_Table2() below.
connection = MySQLdb.connect(mysql['host'], mysql['user'], mysql['password'], mysql['dataBaseName'])
cursor = connection.cursor()
################################ END connection to Mysql ##########################
################################ START CREATE TABLE ##############################
qry = """
CREATE TABLE IF NOT EXISTS """+mysql['DB_tableName1']+""" (
id INT NOT NULL AUTO_INCREMENT,
keyword text NOT NULL,
job_title text NOT NULL,
job_url text NOT NULL,
company text NOT NULL,
post_date date NOT NULL,
job_unique_id text NOT NULL,
Job_description text NOT NULL,
PRIMARY KEY (id) ) ENGINE = MYISAM DEFAULT CHARSET=utf8 ;
"""
# NOTE(review): qry2 is never used anywhere in this file; kept only in case
# another module star-imports it -- confirm and delete.
qry2 = """
"""
try:
    cursor = connection.cursor( MySQLdb.cursors.DictCursor )
    cursor.execute(qry)
except MySQLdb.Error:
    # BUG FIX: was a bare `except:` that swallowed every error (including
    # programming mistakes); catch only driver errors. DDL is not
    # transactional in MyISAM, so the rollback is best-effort.
    connection.rollback()
################################ END CREATE TABLE ##############################
################################ START AUX FUNCTION ##############################
def getElementById(id, html1):
    """Return the cleaned text of the last element matching CSS selector *id*.

    The text is stripped, ASCII-sanitized (non-ASCII dropped), and single
    quotes are backslash-escaped for the SQL string built downstream.

    BUG FIX: the original left `element` unbound when the selector matched
    nothing, raising NameError at `return`; now returns "" in that case.
    """
    element = ""
    for d2 in html1.select(id):
        try:
            element = d2.text.strip().encode('ascii', 'ignore').decode('ascii')
            element = re.sub('\'', '\\\'', element).strip()
        except ValueError:
            print("ValueError on " + str(id))
    return element
def getPostDate(id, html1, i):
    """Return the cleaned text of the i-th element matching CSS selector *id*.

    Text is stripped, ASCII-sanitized, and single quotes are
    backslash-escaped for the SQL string built downstream.
    Raises IndexError when fewer than i+1 elements match (the caller
    iterates a fixed range of 50 -- confirm intended).
    """
    element = html1.select(id)[i]
    try:
        text = element.text.strip().encode('ascii', 'ignore').decode('ascii')
        text = re.sub('\'', '\\\'', text).strip()
    except ValueError:
        # BUG FIX: the original returned the raw Tag object on failure,
        # leaking a non-string into the SQL layer; fall back to plain text.
        print("ValueError on " + str(id))
        text = element.text.strip()
    return text
def getDiceURL(id, html1):
    """Return the href of the first element matching CSS selector *id*,
    whitespace-stripped and ASCII-sanitized (non-ASCII characters dropped).

    Raises IndexError if nothing matches, KeyError if the element has no href.
    """
    first = html1.select(id)[0]
    href = first['href'].strip()
    return href.encode('ascii', 'ignore').decode('ascii')
def getDiceJobDescription(url):
    """Fetch a Dice job-detail page and return the concatenated text of its
    #jobdescSec element(s), ASCII-sanitized with single quotes
    backslash-escaped for the SQL string built downstream.

    Returns "" when the HTTP request fails.

    BUG FIX: the original's bare `except:` executed a no-op string literal
    and fell through, so a failed request left `response` unbound and the
    next line crashed with NameError. Also builds the result with a
    list+join instead of repeated string concatenation.
    """
    try:
        response = requests.get(url)
    except requests.exceptions.RequestException:
        return ""
    html = BeautifulSoup(response.content, "html5lib")
    parts = []
    for tag in html.select("#jobdescSec"):
        text = tag.text.strip().encode('ascii', 'ignore').decode('ascii')
        parts.append(re.sub('\'', '\\\'', text).strip())
    return "".join(parts)
def insert_Into_Table2(keyword, job_title, job_url, company, post_date, job_unique_id, Job_description):
    """Insert one scraped job row into mysql['DB_tableName1'].

    Uses the module-level `cursor`/`connection` globals; commits on success,
    rolls back on driver error.

    SECURITY FIX: the original interpolated scraped text directly into the
    SQL string (SQL injection); values are now passed as DB-API parameters
    so the driver escapes them. Only the table name (trusted config, an
    identifier that cannot be parameterized) is still concatenated.
    NOTE(review): the helper functions pre-escape single quotes for the old
    concatenated SQL; with parameterization that escaping is redundant --
    confirm and remove it there.
    """
    qry = ("INSERT INTO `" + mysql['DB_tableName1'] + "` "
           "(`id`, `keyword`, `job_title`, `job_url`, `company`, `post_date`, `job_unique_id`, `Job_description`) "
           "VALUES (NULL, %s, %s, %s, %s, %s, %s, %s)")
    try:
        cursor.execute(qry, (keyword, job_title, job_url, company, post_date, job_unique_id, Job_description))
        connection.commit()
    except MySQLdb.Error:
        # Was a bare `except:`; catch only driver errors so bugs surface.
        connection.rollback()
################################ START MAIN FUNCTION ##############################
def main():
    """For each keyword in input_keyword_file, scrape the first Dice.com
    results page (50 jobs) and insert each job into MySQL; record
    (total jobs reported, rows fetched) per keyword."""
    # keyword -> (total_found, fetched_count)
    keyword_dict = {}
    # NOTE(review): input_keyword_file comes from the config/indeed star
    # imports; the handle is never closed -- confirm acceptable for a script.
    f = open(input_keyword_file)
    for keywords in f.readlines():
        keyword = ''
        keyword = keywords.strip()
        print 'keyword = ', keyword
        keyword_dict[keyword] = (0,0)
        dup_count = 0
        total_found = 0
        fetched_count = 0
        print "========================================"
        print 'get filtered data'
        print "========================================"
        # First results page only, 50 results per page. NOTE(review): `urllib`
        # is not imported in this file -- presumably star-imported via
        # `from indeed import *`; confirm.
        str1 = requests.get('https://www.dice.com/jobs?q=' + urllib.quote_plus(keyword) +'&limit=50&l=&searchid=9443532295942')
        str1.raise_for_status()
        html1 = BeautifulSoup(str1.content, "html5lib")
        # Dice numbers the result cells #company0..#company49 / #position0..#position49.
        # NOTE(review): if a page has fewer than 50 results this loop hits an
        # IndexError inside the helpers and aborts the keyword -- confirm.
        for i in range(0,50):
            class1 = '#company'+str(i)
            company = getElementById(class1,html1)
            class1 = '#position'+str(i)
            job_title = getElementById(class1,html1)
            job_url = getDiceURL(class1,html1)
            # Job id = 8th path segment of the detail URL, query string dropped.
            dice_id = job_url.split('/')[7].split('?')[0]
            job_description = getDiceJobDescription(job_url)
            #job_description = "test blah blah"
            class1 = '.posted'
            post_date = getPostDate(class1,html1,i)
            post_date = convertStrDate(post_date)
            #write 50 entries into it
            insert_Into_Table2(keyword,job_title,job_url,company,post_date,dice_id,job_description)
            fetched_count = fetched_count + 1
        '''One perpage total Count of jobs'''
        class1 = '#posiCountId'
        #print "class1 = ", class1
        for d1 in html1.select(class1):
            try:
                # The counter text looks like "1 - 50 of 1234": keep only the
                # digits after "of". total_found stays a string here.
                count_result = re.sub('.* of', '', d1.text).strip()
                count_result = re.sub('[^\d+]', '', count_result).strip()
                total_found = count_result
                # Page math (capped at 20 pages) is computed but never used
                # afterwards -- only the first page is actually fetched.
                count_page = math.ceil(int(count_result)/50)
                rest_count_page = int(count_result)%50
                if(rest_count_page > 0):
                    count_page = count_page + 1
                if(count_page>20):
                    count_page = 20
            except ValueError as ve:
                print "ValueError:", ve
            total_found = count_result
            print 'count_result = ', count_result
        # print 'company:', company
        # print 'title:', title
        # print 'post_date:', post_date
        # print 'url:' , url
        # print 'dice_id:',dice_id
        # print 'description:',job_description
        keyword_dict[keyword] = (total_found, fetched_count)
    print keyword_dict
if __name__ == "__main__": main()