-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
125 lines (103 loc) · 3.87 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from mediawiki import MediaWiki
from bs4 import BeautifulSoup
import json
from urllib.parse import urlencode
from urllib.request import urlopen
import mwparserfromhell
import re
from datetime import timedelta
import sys
import pprint
from pymongo import MongoClient
# MediaWiki API endpoint of the Practical Plants wiki; used both for the raw
# revision queries in parse() and for the MediaWiki client created below.
API_URL = "https://practicalplants.org/w/api.php"
def get_sections(page_title, wiki):
    """Return the infobox section titles found on a wiki page.

    The page named ``page_title`` is fetched through ``wiki`` (any object
    exposing ``page(title).html``), its HTML is parsed with BeautifulSoup's
    ``html.parser`` and the text of every ``div.infobox-title`` element is
    collected, in the order ``select`` returns them.
    """
    rendered = BeautifulSoup(wiki.page(page_title).html, "html.parser")
    return [heading.get_text() for heading in rendered.select("div.infobox-title")]
def parse(title):
    """Fetch the latest revision of a wiki page and parse its wikitext.

    Args:
        title: Title of the page to request from ``API_URL``.

    Returns:
        The result of ``mwparserfromhell.parse`` applied to the wikitext of
        the page's most recent revision.

    Raises:
        ValueError: If the HTTP request still fails after ``max_attempts``
            tries, or if the response is not valid JSON (``json.loads``
            raises ``json.JSONDecodeError``, a ``ValueError`` subclass).
    """
    # Function-scope import so this block needs no change to the file header.
    from http.client import HTTPException

    data = {"action": "query", "prop": "revisions", "rvlimit": 1,
            "rvprop": "content", "format": "json", "titles": title}
    payload = urlencode(data).encode()

    # The previous version retried by calling parse() recursively but threw
    # the result away, so `raw` was unbound afterwards and the recursion was
    # unbounded when the server stayed down. Retry a fixed number of times.
    max_attempts = 3
    raw = None
    last_error = None
    for _ in range(max_attempts):
        try:
            # The context manager closes the HTTP response once it is read.
            with urlopen(API_URL, payload) as response:
                raw = response.read()
        except (OSError, HTTPException) as err:
            # URLError/HTTPError and socket timeouts are OSError subclasses.
            last_error = err
            print("Oops, something is broken. Retrying...")
        else:
            break
    else:
        # ValueError is what the ingestion loop catches to skip a page.
        raise ValueError(
            "could not fetch {!r} after {} attempts".format(title, max_attempts)
        ) from last_error

    res = json.loads(raw)
    # The API keys pages by page id; only one title was requested, so take
    # the single entry and read the content ("*") of its first revision.
    text = next(iter(res["query"]["pages"].values()))["revisions"][0]["*"]
    return mwparserfromhell.parse(text)
def get_param_dict(templates):
    """Convert the first template in ``templates`` into a plain dict.

    Each parameter's text is split on ``"="``:

    * no ``"="``: the parameter maps to ``None``;
    * exactly one ``"="``: it maps to the text after the ``"="``;
    * more than one ``"="``: it maps to the value with ``{{...}}`` removed,
      but only when the value contains both ``{{`` and ``}}`` and something
      is left after the removal; otherwise ``None``.

    Every non-``None`` value is then stripped of ``{{...}}`` and ``PFAF...``
    fragments. If templates are still found in the cleaned text the value is
    replaced by the dict for the first of them (recursive call); otherwise a
    value containing commas but no spaces is split into a list.

    Args:
        templates: Sequence of template objects; only the first one is used.

    Returns:
        Dict mapping parameter names to ``None``, a string, a list of
        strings, or a nested dict. Empty when ``templates`` is empty.
    """
    if not templates:
        # Previously templates[0] raised IndexError for template-less input.
        return {}

    mw_template = templates[0]
    param_dict = {}
    for entry in mw_template.params:
        split_entry = entry.strip().split("=")
        name = split_entry[0]
        value = None
        if len(split_entry) == 2:
            value = split_entry[1]
        elif len(split_entry) > 2:
            # Extra "=" signs: only keep the value when it embeds a template,
            # and then keep whatever remains once the template is removed.
            param = mw_template.get(name)
            if "{{" in param.value and "}}" in param:
                result = re.sub(r'{{(.*)}}', '', str(param.value))
                if result:
                    value = result
        param_dict[name] = value

    # Iterate over a snapshot of the keys because values are reassigned.
    for key in list(param_dict):
        if param_dict[key] is None:
            continue
        try:
            cleaned = re.sub(r'{{(.*)}}', '', param_dict[key])
            cleaned = re.sub(r'PFAF(.*[0-9])', '', cleaned)
        except TypeError:
            print("TypeError:", param_dict[key])
            sys.exit(1)
        param_dict[key] = cleaned

        nested = mwparserfromhell.parse(cleaned).filter_templates()
        if nested:
            param_dict[key] = get_param_dict(nested)
        elif " " not in cleaned and "," in cleaned:
            # `elif`: the list split applies to strings only, never to the
            # nested dict produced by the branch above.
            param_dict[key] = cleaned.split(",")
    return param_dict
# Connect to the local MongoDB instance and bind the `crops` collection that
# the ingestion loop writes to. The script exits with status 1 on failure.
try:
    connection = MongoClient("mongodb://localhost:27017")
    # Force a round trip so an unreachable server is detected here.
    # `database_names()` was removed in PyMongo 4.0; the "ping" command works
    # on old and new driver versions alike.
    connection.admin.command("ping")
    db = connection.database
    crops = db.crops
except Exception as exc:
    # `Exception` instead of a bare except, so Ctrl-C is not reported as a
    # connection failure; the cause is printed to make the failure diagnosable.
    print("MongoDB connection has failed somehow...", exc)
    sys.exit(1)
# Diagnostics helpers: the pretty-printer and the section accumulator are only
# referenced by the optional, currently commented-out section-listing code.
pp = pprint.PrettyPrinter(indent=4)
full_section_list = []
# Running count of plant records handled by the ingestion loop.
record_number_ingested = 0
# Fetch NUM_RESULTS MediaWiki pages from the category named CATEGORY.
# Set NUM_RESULTS to None to get all pages in the category.
CATEGORY = "Plant"
NUM_RESULTS = None
# Client for API_URL with rate limiting enabled (one-second wait) and
# timeout=100; used to list the member pages of CATEGORY (subcategories excluded).
plantwiki = MediaWiki(url=API_URL, rate_limit=True, rate_limit_wait=(timedelta(seconds=1)), timeout=100)
all_plant_names = plantwiki.categorymembers(CATEGORY, results=NUM_RESULTS, subcategories=False)
# Ingest every plant page: fetch and parse its wikitext, flatten the first
# template into a dict and store that dict in the `crops` collection.
for plant_name in all_plant_names:
    try:
        wikicode = parse(plant_name)
    except ValueError:
        print("Encountered an issue getting", plant_name, "from wiki. Continuing on...")
        continue

    templates = wikicode.filter_templates()
    if not templates:
        # A page with no templates has no parameters to ingest; skip it
        # rather than hand get_param_dict an empty list.
        print("No templates found on", plant_name, "- skipping.")
        continue
    plant_info = get_param_dict(templates)

    # INFO: Getting sections takes quite a bit of time
    # NOTE(review): the disabled call below uses all_plant_names[0];
    # presumably plant_name was intended - confirm before re-enabling.
    # section_list = get_sections(all_plant_names[0], plantwiki)
    # for section in section_list:
    #     if section not in full_section_list:
    #         full_section_list.append(section)

    # MORE INFO
    print("Name:", plant_name)
    inserted_id = crops.insert_one(plant_info).inserted_id
    # Count the record only once the insert has succeeded.
    record_number_ingested += 1
    print("MongoDB ID:", inserted_id, "\n")
    # print("Section list:", section_list)

# EVEN MORE INFO
# print("\nList of section keys:")
# pp.pprint(full_section_list)
print("Records ingested:", record_number_ingested)