import os

import requests

from crawl_report import ReportBuilder, ReportSaver
from parsing import ApacheParser, NginxParser


class Crawler:
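    """Crawl an open directory listing and collect the files it exposes."""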
    def __init__(self, url, test_url):
        self.files = []
        self.parsed_urls = []
        self.base_url = os.path.join(url, '')  # Make sure the base url ends with a trailing slash

        if url.startswith("http"):
            if test_url:
                # Fetch the root page once to pick a suitable parser
                try:
                    r = requests.get(self.base_url, timeout=10)  # todo change to 30
                    if r.status_code == 200:
                        parser_class = self.guess_parser(r.text, r.headers)
                        if parser_class is None:
                            print("Couldn't find a parser that understands this listing")
                            self.parser = None
                        else:
                            self.parser = parser_class()
                            print("Using " + self.parser.__class__.__name__ + " as parser")
                    else:
                        print("Couldn't connect (" + str(r.status_code) + ")")
                        self.parser = None
                except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout,
                        requests.exceptions.ConnectionError):
                    print("Timed out / Connection refused")
                    self.parser = None
            else:
                print("Using ApacheParser by default because test_url was set to False")
                self.parser = ApacheParser()  # Default parser
        else:
            print("Invalid schema: url must start with http")
            self.parser = None

    @staticmethod
    def guess_parser(text, headers):
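        """Return the parser class that recognises the page layout, or None."""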
        server = headers["Server"] if "Server" in headers else ""  # Server header, currently unused

        # Try nginx
        parser = NginxParser()
        if parser.page_is_valid(text):
            return NginxParser

        # Try apache
        parser = ApacheParser()
        if parser.page_is_valid(text):
            return ApacheParser

        return None

    def crawl(self, address=None):
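        """Recursively crawl directory pages under base_url and record every file found."""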
        if self.parser is None:
            return

        if address is None:
            address = self.base_url

        # Prevent unwanted recursion: skip addresses that were already visited
        if address in self.parsed_urls:
            return
        self.parsed_urls.append(address)

        # Stay inside the open directory
        if not address.startswith(self.base_url):
            print("Skipping " + address + " because it does not match " + self.base_url)
            return

        retries = 20
        while retries > 0:
            try:
                response = requests.get(address, timeout=10)
                break
            except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError):
                retries -= 1
                print("Timeout, " + str(retries) + " retries left")
        else:
            # Gave up after 20 failed attempts
            return
        links = self.parser.get_links(response.text, address)

        for k in links:
            if links[k]["type"] == "d":
                # Sub-directory: recurse into it
                print(links[k]["link"])
                self.crawl(links[k]["link"])
            else:
                # File: record its link, size and extension
                self.files.append(dict(link=links[k]["link"], size=links[k]["size"], ext=links[k]["ext"]))

    def store_report(self, report_id, title):
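        """Write the chart data, full report and plain link list under static/reports/."""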
        report_saver = ReportSaver(self.files, title, ReportBuilder(self.files, self.base_url))

        with open("static/reports/" + report_id + "_chart.json", "w") as f:
            f.write(report_saver.to_json_chart())

        with open("static/reports/" + report_id + ".json", "w") as f:
            f.write(report_saver.to_json())

        with open("static/reports/" + report_id + ".txt", "w") as f:
            f.write(report_saver.to_link_list())


if __name__ == "__main__":
    c = Crawler("http://www.downloads.imune.net/medicalbooks/", True)
    c.crawl()

    r = ReportBuilder(c.files, "http://www.downloads.imune.net/medicalbooks/")
    print(r.get_total_size_formatted())

    # for f in c.files:
    #     if f["size"] > 1000000:
    #         print(f)

    c.store_report("000011", "test")