-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget-req.py
228 lines (198 loc) · 6.9 KB
/
get-req.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import json
import re
import time
import urllib.error
import urllib.request

import pyodbc
from bs4 import BeautifulSoup
# Get Rankings for each program
def getRankings():
overall_url = 'https://www.macleans.ca/education/canadas-top-school-by-reputation-2020/'
program_url = 'https://www.macleans.ca/?p={}'
### Page id corresponding to each program
dict_programs = {1184826:'Biology', 1184828:'Business', 1184829:'Computer Science', 1184830:'Education', 1184831:'Engineering', 1184832:'Environmental-Science', 1184833:'Mathematics', 1184834:'Medicine', 1184835:'Nursing', 1184836:'Psychology'}
try:
page = urllib.request.urlopen(overall_url) # Make Connection
except:
print("An error occured.")
soup = BeautifulSoup(page, 'html.parser')
table = soup.find("table", {'class': 'footable rdm-footable'})
table_body = table.find('tbody')
data = []
# Get Overall Rankings
# After finding the table with the related data, retrieve table contents row by row
rows = table_body.find_all('tr')
for row in rows:
cols = row.find_all('td')
cols = [ele.text.strip() for ele in cols]
data.append([ele for ele in cols if ele]) # Get rid of empty values
top_universities = []
for i in range(0, len(data)):
top_universities.append(data[i][:2])
print('Overall Ranking')
return (top_universities)
# Get Ranking by program,
print('\nBy Program')
for key, program in dict_programs.items():
print(program)
try:
page = urllib.request.urlopen(program_url.format(key)) #Make Connection
except:
print("An Error has occured.")
soup = BeautifulSoup(page, 'html.parser')
table = soup.find("table", {'class': 'footable rdm-footable'})
table_body = table.find('tbody')
data = []
rows = table_body.find_all('tr')
# After table is scraped for the releavnt program, skim through table row by row
for row in rows:
cols = row.find_all('td')
cols = [ele.text.strip() for ele in cols]
data.append([ele for ele in cols if ele]) # Get rid of empty values
overall_rank = []
for row in data:
overall_rank.append(row[:2])
print(overall_rank)
def getTuition():
url = 'https://www.univcan.ca/universities/facts-and-stats/tuition-fees-by-university/'
try:
page = urllib.request.urlopen(url) #Make Connection
except:
print("An error occured.")
soup = BeautifulSoup(page, 'html.parser')
table = soup.find("tbody")
data = []
rows = table.find_all('tr')
for row in rows:
cols = row.find_all('td')
cols = [ele.text.strip() for ele in cols]
data.append([ele for ele in cols if ele]) # Get rid of empty values
tuition_summary = []
for row in data:
for i in range(0, len(row)):
row[i] = row[i].replace('\xa0', '')
tuition_summary.append(row)
return tuition_summary
# Connect to a database to load results
def connectDB():
endpoint = 'database-unifind.ccyugazvu1zd.ca-central-1.rds.amazonaws.com'
user = 'admin'
pw = '{}'
conn = 0
try:
conn = pyodbc.connect(
driver = 'ODBC Driver 17 for SQL Server',
server = endpoint,
port = 3306,
user = user,
password = pw,
timeout = 5
)
print('Connection Successful')
sql = 'CREATE TABLE Test( id INT PRIMARY KEY);'
crsr = conn.cursor()
crsr.execute(sql)
row = crsr.fetchone()
print (row[0])
except:
print("An Error has occured.")
finally:
conn.close()
# sql = 'select @@version'
# conn.close()
def normalizeUniNames(data):
dict_names = {
'UBC':'The University of British Columbia',
'Toronto': 'University of Toronto',
'Waterloo': 'University of Waterloo',
'McGill' : 'McGill University',
'McMaster': 'McMaster University',
'Alberta': 'University of Alberta',
"Queen's" : "Queen's University",
'Western' : 'Western University (excludes colleges)',
'Simon Fraser' : 'Simon Fraser University',
'Montreal' : 'Université de Montréal',
'Calgary' : 'University of Calgary',
'Guelph': 'University of Guelph',
'Victoria' : 'University of Victoria',
'Dalhousie' : 'Dalhousie University',
'Laval' : 'Université Laval',
'Ryerson' : 'Ryerson University',
'Concordia': 'Concordia University',
'Ottawa' : 'University of Ottawa',
'Sherbrooke' : 'Université de Sherbrooke',
'Saskatchewan' : 'University of Saskatchewan',
'York' : 'York University' ,
'Memorial' : 'Memorial University' ,
'Manitoba' : 'University of Manitoba' ,
'Carleton' : 'Carleton University' ,
'Wilfrid Laurier' : 'Wilfrid Laurier University' ,
'UQAM' : 'Université du Québec à Montréal' ,
'Mount Allison' : 'Mount Allison University' ,
'St. Francis Xavier' : 'St. Francis Xavier University' ,
'Acadia' : 'Acadia University' ,
'Trent' : 'Trent University' ,
'New Brunswick' : 'New Brunswick University' ,
'Ontario Tech' : 'University of Ontario Institute of Technology' ,
'Regina' : 'University of Regina' ,
'UNBC' : 'University of Northern British Columbia' ,
'Lethbridge' : 'University of Lethbridge' ,
'Saint Mary\'s' : 'Saint Mary\'s University' ,
'Winnipeg' : 'University of Winnipeg' ,
'Brock' : 'Brock University' ,
'Bishop\'s' : 'Bishop\'s University' ,
'Windsor' : 'University of Windsor' ,
'UPEI' : 'University of Prince Edward Island' ,
'Lakehead' : 'Lakehead University' ,
'Mount Saint Vincent' : 'Mount Saint Vincent University' ,
'Moncton' : 'Université de Moncton' ,
'Laurentian' : 'Laurentian University' ,
'Cape Breton' : 'Cape Breton University' ,
'St. Thomas' : 'St. Thomas University' ,
'Brandon' : 'Brandon University' ,
'Nipissing' : 'Nipissing University'
}
for i in range(0,len(data)):
if data[i][1] in list(dict_names.keys()):
data[i][1] = dict_names.get(data[i][1])
return data
def mergeTuitionRankingData(tuition_data, ranking_data):
output = []
for i in range(0, len(tuition_data)):
for j in range(0, len(ranking_data)):
if ranking_data[j][1] == tuition_data[i][0]:
tuition_data[i].append(ranking_data[j][0])
output.append(tuition_data[i])
return output
tuition_data = getTuition()
ranking_data = getRankings()
normalized = normalizeUniNames(ranking_data)
merged = mergeTuitionRankingData(tuition_data, normalized)
# Write results to JSON file
with open('data.json', 'w') as f:
aggregate = {}
aggregate['universities'] = []
for ele in merged:
if len(ele) > 6:
data = {}
data[1] = ele[6]
data[2] = ele[0]
data[3] = ele[1]
data[4] = ele[2]
data[5] = ele[3]
data[6] = ele[4]
data[7] = ele[5]
# data['rank'] = ele[6]
# data['name'] = ele[0]
# data['domestic-under'] = ele[1]
# data['foreign-under'] = ele[2]
# data['domestic-grad'] = ele[3]
# data['foreign-grad'] = ele[4]
# data['location'] = ele[5]
# aggregate['universities'].append(data)
# print(aggregate)
row = '<tr>'
for i in range(1,7):
row = row + ('<td class="column{}">'.format(i)+data[i]+'</td>')
row= row + ('</tr>')
print(row)
# json.dump(aggregate, f)