-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathobc_new.py
114 lines (92 loc) · 3.86 KB
/
obc_new.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""
"""
import datetime as dt
import os
import xml.etree.ElementTree as ET
from collections import defaultdict

import requests
# Endpoint whose JSON response is scanned for matching "pmid" / "doi" values
# when deciding whether an article is already in OBC.
OBC_IDE_URL = "http://devapi.onbc.io/publications"
class PubmedArticle:
    """One ``<PubmedArticle>`` node reduced to the identifiers used for OBC lookup.

    Attributes:
        xml: The ElementTree element the instance was built from (kept so the
            article can be serialised again later).
        pmid: Text of the ``ArticleId`` with ``IdType="pubmed"``, or ``None``.
        doi: Text of the ``ArticleId`` with ``IdType="doi"``, or ``None``.
    """

    def __init__(self, xml):
        """Store the element and extract its PMID and DOI.

        Args:
            xml: A ``<PubmedArticle>`` element (not a ``<PubmedArticleSet>``).
        """
        self.xml = xml
        data = self._parse_xml(xml)
        self.doi = data["doi"]
        self.pmid = data["pmid"]

    def _parse_xml(self, root):
        """Collect the PMID and DOI listed under ``PubmedData/ArticleIdList``.

        Args:
            root: A ``<PubmedArticle>`` element.

        Returns:
            A dict with keys ``"pmid"`` and ``"doi"``; a value is ``None`` when
            the corresponding ``ArticleId`` is absent.
        """
        result = {"pmid": None, "doi": None}
        # A single path query yields an empty list when PubmedData or
        # ArticleIdList is missing, instead of failing on ``None.find``.
        # (The same identifiers could also be read from MedlineCitation/PMID
        # and Article/ELocationID, but ArticleIdList is simpler.)
        for article_id in root.findall("./PubmedData/ArticleIdList/ArticleId"):
            id_type = article_id.get("IdType")
            if id_type == "pubmed":
                result["pmid"] = article_id.text
            elif id_type == "doi":
                result["doi"] = article_id.text
        return result

    def publication_in_obc(self, entries=None):
        """Return True if this article appears in the OBC publication list.

        Each entry is checked against the PMID first, then the DOI. The matched
        identifier is printed before returning.

        Args:
            entries: Optional iterable of dict-like publication records with
                optional ``"pmid"`` and ``"doi"`` keys. When omitted, the list
                is downloaded from ``OBC_IDE_URL``.

        Returns:
            True on the first entry whose PMID or DOI equals this article's,
            otherwise False.
        """
        if entries is None:
            entries = _fetch_obc_publications()
        # NOTE(review): identifiers are compared with plain equality, which
        # assumes the API returns them as strings -- confirm.
        for entry in entries:
            # Guard against None so an article without a PMID does not "match"
            # an entry that simply has no "pmid" key.
            if self.pmid is not None and self.pmid == entry.get("pmid"):
                print(entry.get("pmid"))
                return True
            if self.doi is not None and self.doi == entry.get("doi"):
                print(entry.get("doi"))
                return True
        return False


def _fetch_obc_publications():
    """Download and decode the publication list served at ``OBC_IDE_URL``."""
    # A timeout keeps a stalled connection from hanging the run forever.
    response = requests.get(OBC_IDE_URL, timeout=30)
    # Surface HTTP errors here rather than as a confusing failure while
    # iterating over an error payload.
    response.raise_for_status()
    return response.json()


def _grant_key(grant_text):
    """Return the grouping key for the raw text of a ``<GrantID>`` element."""
    parts = grant_text.split(" ")
    if len(parts) == 3:
        # Three space-separated tokens: drop the first and fuse the other two,
        # e.g. "R01 GM 123456" -> "GM123456".
        return parts[1] + parts[2]
    return grant_text


def sort_xml(path, entries=None):
    """Split the articles of a ``<PubmedArticleSet>`` file by OBC membership.

    Args:
        path: Path of the XML file to parse.
        entries: Optional iterable of OBC publication records. When omitted,
            the list is downloaded once and reused for every article.

    Returns:
        A tuple ``(in_obc, not_in_obc, paper)``: two lists of ``PubmedArticle``
        and a ``defaultdict(list)`` mapping each grant key to the PMIDs of the
        articles *not* found in OBC that cite it.
    """
    in_obc = []
    not_in_obc = []
    paper = defaultdict(list)

    root = ET.parse(path).getroot()
    if entries is None:
        # Nothing to look up for an empty article set, so skip the request.
        entries = _fetch_obc_publications() if len(root) else []
    else:
        # Materialise once so a generator can be scanned for every article.
        entries = list(entries)

    for node in root:
        article = PubmedArticle(node)
        if article.publication_in_obc(entries):
            in_obc.append(article)
            continue

        not_in_obc.append(article)
        # Read the PMID by tag rather than by child position.
        citation_pmid = node.findtext("./MedlineCitation/PMID")
        grant_nodes = node.findall(
            "./MedlineCitation/Article/GrantList/Grant/GrantID"
        )
        for grant_node in grant_nodes:
            if grant_node.text is None:
                # Empty <GrantID/>: nothing to group by.
                continue
            paper[_grant_key(grant_node.text)].append(citation_pmid)
    return in_obc, not_in_obc, paper
def write_to_xml(lst, path):
    """Write the articles in ``lst`` to ``path`` inside a ``<PubmedArticleSet>``.

    Args:
        lst: Iterable of objects whose ``xml`` attribute is an ElementTree
            element (e.g. ``PubmedArticle`` instances).
        path: Destination file; it is created or overwritten.
    """
    # Serialise everything before opening the file, so a serialisation error
    # does not leave a truncated file behind.
    serialised = []
    for article in lst:
        markup = ET.tostring(article.xml).decode('utf-8')
        serialised.append(markup + "\n")

    opening = "<PubmedArticleSet>" + '\n'
    closing = '\n' + "</PubmedArticleSet>"
    with open(path, 'w') as out:
        out.write(opening)
        out.write("".join(serialised))
        out.write(closing)
    return
def get_xml(directory=None):
    """Sort today's publication file and write the two resulting XML files.

    Reads ``most_recent_publications_<YYYY-MM-DD>.xml`` (today's local date)
    from ``directory``, prints how many articles were and were not found in
    OBC, and writes ``in_obc.xml`` and ``not_in_obc.xml`` to the same
    directory.

    Args:
        directory: Folder holding the input file and receiving the outputs.
            Defaults to the module-level ``path1``, which is not defined in
            this file and must be supplied elsewhere.

    Returns:
        The grant-to-PMIDs mapping produced by ``sort_xml``.
    """
    if directory is None:
        directory = path1

    today = dt.datetime.today().strftime("%Y-%m-%d")
    # os.path.join uses the platform separator instead of a hard-coded "\\".
    filepath = os.path.join(
        directory, "most_recent_publications_{}.xml".format(today)
    )

    in_obc, not_in_obc, paper = sort_xml(filepath)
    print("In OBC: " + str(len(in_obc)))
    print("Not In OBC: " + str(len(not_in_obc)))

    write_to_xml(in_obc, os.path.join(directory, 'in_obc.xml'))
    write_to_xml(not_in_obc, os.path.join(directory, 'not_in_obc.xml'))
    return paper