-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathcheck_mismatch_file.py
286 lines (225 loc) · 10.1 KB
/
check_mismatch_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
"""
Script that checks the formatting of a mismatch file to see if it's valid to be uploaded to Mismatch Finder.
Note: the upload limit for the Mismatch Finder API is 10 MB.
Please see the Mismatch Finder User Guide for more information:
https://github.com/wmde/wikidata-mismatch-finder/blob/development/docs/UserGuide.md
Usage:
python3 check_mismatch_file.py --mismatch-file MISMATCH_FILE --verbose
Example:
python3 check_mismatch_file.py --mismatch-file test_mismatches.csv --verbose
Abbreviated arguments usage:
python3 check_mismatch_file.py -mf MISMATCH_FILE -v
Abbreviated arguments example:
python3 check_mismatch_file.py -mf test_mismatches.csv -v
"""
import argparse
import os
import re
import numpy as np
import pandas as pd
from urllib.parse import urlparse
# Section: Functions to check the passed mismatch file.
def _validate_url(url):
"""
Check that a value is not null and is a valid URL if so.
"""
if pd.isnull(url) is None:
try:
url_parse = urlparse(url)
return all([url_parse.scheme, url_parse.netloc])
except:
return False
return True
def check_mf_formatting(df: pd.DataFrame):
    """
    Checks a Pandas DataFrame to see whether it will produce a valid CSV for Mismatch Finder.

    For conditions, please see:
    https://github.com/wmde/wikidata-mismatch-finder/blob/main/docs/UserGuide.md#creating-a-mismatches-import-file

    Parameters
    ----------
        df : pandas.DataFrame
            A DataFrame for which we want to run df.to_csv().

    Returns
    -------
        None
            A confirmation message is printed if all checks pass.

    Raises
    ------
        ValueError
            If at least one check fails. The message contains a numbered list of
            directions on how to fix each of the detected issues.
    """
    # Each failed check appends one instruction; an empty list means the data is valid.
    correction_instruction = []

    # 1. Check that all required columns are included.
    required_columns = [
        "item_id",
        "statement_guid",
        "property_id",
        "wikidata_value",
        "meta_wikidata_value",
        "external_value",
        "external_url",
        "type",
    ]
    if list(df.columns) != required_columns:
        required_columns_string = "'" + "', '".join(required_columns) + "'"
        correction_instruction.append(
            f"Please check that the following columns are present in this exact order:\n {required_columns_string}"
        )

    # 2. Check that all QIDs and PIDs are formatted correctly.
    id_patterns = {"item_id": r"^Q\d+$", "property_id": r"^P\d+$"}
    columns_with_invalid_ids = [
        c
        for c, pattern in id_patterns.items()
        if c in df.columns and not df[c].astype(str).str.match(pattern).all()
    ]
    if columns_with_invalid_ids:
        correction_instruction.append(
            "Please assure that the following columns have valid ids:"
            + "".join(f"\n - {c}" for c in columns_with_invalid_ids)
        )

    # 3. Check that there are no nulls in non-optional columns.
    required_value_columns = ["item_id", "property_id", "external_value"]
    columns_with_nulls = [
        c
        for c in required_value_columns
        if c in df.columns and df[c].isnull().any()
    ]
    if columns_with_nulls:
        correction_instruction.append(
            "Please assure that the following columns do not have null values:"
            + "".join(f"\n - {c}" for c in columns_with_nulls)
        )

    # 4. Check that values exist for all rows where there is a statement.
    if "statement_guid" in df.columns:
        guids = df["statement_guid"]

        if "wikidata_value" in df.columns:
            # A Wikidata value without the GUID of its statement cannot be located.
            value_without_guid = df["wikidata_value"].notnull() & guids.isnull()
            if value_without_guid.any():
                correction_instruction.append(
                    "Please assure that `statement_guid` is null only in cases where `wikidata_value` is as well."
                )

        # Null GUIDs are allowed, so only the non-null ones are checked.
        # Every one of them must match, not just at least one.
        guid_pattern = re.compile(r"^Q\d+\$\w{8}-\w{4}-\w{4}-\w{4}-\w{12}$")
        if not all(guid_pattern.match(str(g)) for g in guids.dropna()):
            correction_instruction.append(
                "Some values in the column `statement_guid` are not formatted correctly. GUIDs begin with a QID, which is then followed by a dollar sign and alphanumeric characters separated by dashes."
            )

    # 5. Check that all external URLs are valid.
    if "external_url" in df.columns:
        # Iterating over the values avoids a dependence on the DataFrame's index labels.
        invalid_urls = [u for u in df["external_url"] if not _validate_url(u)]
        if invalid_urls:
            correction_instruction.append(
                "Please check the following URLs in `external_url` to make sure that they're valid:"
                + "".join(f"\n - {u}" for u in invalid_urls)
            )

    # 6. Check that all type values are 'statement', 'qualifier' or a null value that will be made 'statement'.
    if "type" in df.columns:
        # Nulls are dropped rather than compared as NaN never equals itself, which
        # would make a fully null (float) column fail a set membership check.
        allowed_types = {"statement", "qualifier"}
        included_types = set(df["type"].dropna().unique())
        if not included_types.issubset(allowed_types):
            correction_instruction.append(
                "Please check that the `type` column contains only: 'statement', 'qualifier' or a null value."
            )

    # 7. Check that values for certain columns are less than 1,500 characters.
    check_value_length_columns = ["wikidata_value", "external_value", "external_url"]
    columns_with_too_long_values = [
        c
        for c in check_value_length_columns
        if c in df.columns and (df[c].astype(str).str.len() > 1500).any()
    ]
    if columns_with_too_long_values:
        correction_instruction.append(
            "Please assure that the following columns do not have values over 1,500 characters:"
            + "".join(f"\n - {c}" for c in columns_with_too_long_values)
        )

    # Raise exception if there's a data formatting issue or print that all checks have passed.
    if correction_instruction:
        mf_file_creation_directions = (
            "\nThere's a problem with the DataFrame. Please see the Mismatch Finder file creation directions on GitHub:\n"
            "https://github.com/wmde/wikidata-mismatch-finder/blob/main/docs/UserGuide.md#creating-a-mismatches-import-file\n"
            "Directions on how to fix the DataFrame are also detailed below:\n"
        )
        value_error_message = mf_file_creation_directions + "".join(
            f"\n{i}. {instruction}\n"
            for i, instruction in enumerate(correction_instruction, start=1)
        )

        raise ValueError(value_error_message)

    print(
        "All checks have passed! The data is ready to be uploaded to Mismatch Finder."
    )
# Section: helper classes and functions for the script.
class terminal_colors:
    """
    Namespace of ANSI escape sequences used to color warnings printed to the terminal.
    """

    # 24-bit foreground color sequence with the RGB value (153, 0, 0).
    WD_RED = "\033[38;2;153;0;0m"

    # Sequence that resets the terminal text attributes after colored output.
    RESET = "\033[0m"
def lower(s: str):
    """
    Lowercases the first character of a string and leaves the rest unchanged.

    Parameters
    ----------
        s : str
            The string to convert.

    Returns
    -------
        str
            The converted string, or an empty string if the input is empty.
    """
    if not s:
        return ""

    first_character, remainder = s[:1], s[1:]

    return first_character.lower() + remainder
# Section: Set arguments for the script.
# The default help action is replaced by an explicit one so that its help text
# can be capitalized like the other arguments without accessing parser internals.
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument(
    "-h",
    "--help",
    action="help",
    default=argparse.SUPPRESS,
    help="Show this help message and exit.",
)
parser.add_argument(
    "-v", "--verbose", help="Increase output verbosity.", action="store_true"
)

# Kept in a variable as the help text is reused within the error messages below.
mismatch_file_help = (
    "Path to the CSV file containing mismatches to import to Mismatch Finder."
)
parser.add_argument(
    "-mf",
    "--mismatch-file",
    help=mismatch_file_help,
)

args = parser.parse_args()

VERBOSE = args.verbose
MISMATCH_FILE = args.mismatch_file

# Section: Validation of passed arguments.

# Note: parser.error is used instead of assert statements as assertions are removed
# when Python is run with -O. It prints the usage and message, then exits with status 2.
if not MISMATCH_FILE:
    parser.error(
        "Please provide a path via the --mismatch-file (-mf) argument:\n"
        f"--mismatch-file (-mf): a {lower(mismatch_file_help)}"
    )

# Check that the file exists and that it is a CSV with a filesize less than 10 MB.
if not os.path.isfile(MISMATCH_FILE):
    parser.error(
        f"Mismatch file not found. Please provide a {lower(mismatch_file_help)}"
    )

if not MISMATCH_FILE.endswith(".csv"):
    parser.error(
        f"Mismatch file not a CSV. Please provide a {lower(mismatch_file_help)}"
    )

# Shifting by 20 bits converts bytes to whole mebibytes (rounded down).
mf_size = os.path.getsize(MISMATCH_FILE) >> 20

# An oversized file only triggers a warning and the formatting checks still run.
if mf_size >= 10:
    print(
        f"\n{terminal_colors.WD_RED}WARNING: The size of the passed mismatch file via the --mismatch-file (-mf) argument is greater than the Mismatch Finder import file size limit of 10 MB. Please break this file down into smaller CSV files using `split_mismatch_file.py` before attempting to upload the file.{terminal_colors.RESET}\n"
    )

# Section: Run check_mf_formatting over the provided mismatch file.
if VERBOSE:
    print(
        f"Checking the data within the mismatch file {MISMATCH_FILE} to see if it's valid for uploading to Mismatch Finder..."
    )

df_mismatch_file = pd.read_csv(MISMATCH_FILE)

check_mf_formatting(df=df_mismatch_file)