forked from simon987/opendirectories-bot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawl_report.py
124 lines (79 loc) · 2.89 KB
/
crawl_report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import humanfriendly
import datetime
import json
class ReportBuilder:
    """Compute size and extension statistics over a list of crawled files.

    Every entry of ``files`` is indexed with the keys ``"size"`` and
    ``"ext"`` by the methods of this class.
    """

    def __init__(self, files: list, base_url: str):
        """Store the file entries and base URL and stamp the creation time."""
        self.base_url = base_url
        self.files = files
        # Naive local timestamp taken when the builder is created.
        self.report_time = datetime.datetime.today()

    def get_total_size(self):
        """Return the sum of the ``"size"`` values of all files (0 if none)."""
        return sum(entry["size"] for entry in self.files)

    def get_total_size_formatted(self):
        """Return the total size as text, or ``"Unknown"`` when it is zero."""
        total = self.get_total_size()
        if total != 0:
            # The second argument is deliberately positional, exactly as the
            # original call passed it.
            # NOTE(review): in current humanfriendly releases this slot is
            # `keep_width` - confirm against the pinned version.
            return humanfriendly.format_size(total, True)
        return "Unknown"

    def get_ext_counts(self):
        """Return a dict mapping each lower-cased extension to its file count."""
        counts = {}
        for entry in self.files:
            key = entry["ext"].lower()
            counts[key] = counts.get(key, 0) + 1
        return counts

    def get_ext_sizes(self):
        """Return a dict mapping each lower-cased extension to its summed size."""
        totals = {}
        for entry in self.files:
            key = entry["ext"].lower()
            if key not in totals:
                # First file seen with this extension seeds the total.
                totals[key] = entry["size"]
            else:
                totals[key] += entry["size"]
        return totals

    def get_ext_sizes_formatted(self):
        """Return the per-extension sizes rendered by ``humanfriendly``."""
        return {
            key: humanfriendly.format_size(amount, True)
            for key, amount in self.get_ext_sizes().items()
        }
class ReportSaver:
    """Serialize a crawl report to JSON or to a newline-separated link list.

    ``files`` entries are indexed with the key ``"link"``. Statistics come
    from ``builder``; note that ``total_count`` is taken from
    ``builder.files``, not from the ``files`` passed here.
    """

    # String annotation: the class is only needed by type checkers here.
    def __init__(self, files, title, builder: "ReportBuilder"):
        """Keep references to the file entries, the post title and the builder."""
        self.files = files
        self.builder = builder
        self.title = title

    def to_json(self):
        """Return the full report as a JSON string.

        Each link is stored with the base URL prefix cut off. The cut is made
        one character before the end of ``base_url``, so the last character
        of the base URL stays at the start of every stored path
        (presumably the trailing ``/`` - verify against the caller).
        Links are sliced by length only; they are not checked to actually
        start with ``base_url``.
        """
        builder = self.builder
        # Clamp to 0: with an empty base_url the offset would be -1, and
        # link[-1:] would keep only the last character of each link.
        cut = max(len(builder.base_url) - 1, 0)
        report = {
            "files": [entry["link"][cut:] for entry in self.files],
            "total_size": builder.get_total_size(),
            "base_url": builder.base_url,
            "total_size_formatted": builder.get_total_size_formatted(),
            "ext_count": builder.get_ext_counts(),
            "ext_sizes": builder.get_ext_sizes(),
            "ext_sizes_formatted": builder.get_ext_sizes_formatted(),
            "report_time": str(builder.report_time),
            "total_count": len(builder.files),
            "post_title": self.title,
        }
        return json.dumps(report)

    def to_json_chart(self):
        """Return a reduced JSON report.

        Same as ``to_json`` but without the file list and without the
        human-formatted size fields.
        """
        builder = self.builder
        chart = {
            "total_size": builder.get_total_size(),
            "base_url": builder.base_url,
            "ext_count": builder.get_ext_counts(),
            "ext_sizes": builder.get_ext_sizes(),
            "report_time": str(builder.report_time),
            "total_count": len(builder.files),
            "post_title": self.title,
        }
        return json.dumps(chart)

    def to_link_list(self):
        """Return all links joined by newlines, with no trailing newline.

        Returns an empty string when there are no files.
        """
        # join avoids repeated string concatenation and the trailing-newline
        # trim the loop version needed.
        return "\n".join(entry["link"] for entry in self.files)