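"""bench.py: a benchmark suite for Scrapy."""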

import os
import statistics
import subprocess

import click

import codespeedinfo


class CommandOption:
    """Container for the global options shared by all benchmark commands."""

    def __init__(self, n_runs, only_result, upload_result, book_url, vmprof, set):
        self.n_runs = n_runs
        self.only_result = only_result
        self.upload_result = upload_result
        self.book_url = book_url
        self.vmprof = vmprof
        self.set = set


def calculator(
        test,
        arg,
        n_runs,
        only_result,
        upload_result=False,
        vmprof=False,
        workpath=None):
    """Run a benchmark script ``n_runs`` times and report items/sec statistics."""
    # Resolve the default working directory at call time, not import time.
    if workpath is None:
        workpath = os.getcwd()
    w = []
    command = 'python {}'.format(arg)
    if vmprof:
        # vmprof profiles the run and uploads the result to the web.
        command = 'python -m vmprof --web {}'.format(arg)
    for x in range(n_runs):
        if only_result:
            # Hide the spider's own output; only the summary below is shown.
            process = subprocess.Popen(
                command,
                cwd=workpath,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
        else:
            process = subprocess.Popen(command, cwd=workpath, shell=True)
        process.wait()

        # Each run writes one items/sec reading per line to Benchmark.txt.
        with open(os.path.join(workpath, "Benchmark.txt")) as f:
            for line in f:
                w.append(float(line))

    click.secho(
        "\nThe results of the benchmark are (all speeds in items/sec):\n",
        bold=True)
    click.secho(
        "\nTest = '{0}' Iterations = '{1}'\n".format(test, n_runs),
        bold=True)
    click.secho(
        "\nMean : {0} Median : {1} Std Dev : {2}\n".format(
            statistics.mean(w),
            statistics.median(w),
            statistics.pstdev(w)),
        bold=True)

    if upload_result:
        codespeedinfo.uploadresult(test, w)

    os.remove(os.path.join(workpath, "Benchmark.txt"))
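

# A minimal sketch of how ``calculator`` is driven (``mybench.py`` is a
# hypothetical script that writes one items/sec reading per line to
# ``Benchmark.txt`` in the working directory):
#
#     calculator("My Benchmark", "mybench.py", n_runs=3, only_result=True)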


@click.group(chain=True)
@click.option(
    '--n-runs',
    default=1,
    help="Number of readings to take for the benchmark.")
@click.option('--only_result', is_flag=True, help="Display only the results.")
@click.option(
    '--upload_result',
    is_flag=True,
    help="Upload the results to a local Codespeed instance.")
@click.option(
    '--book_url',
    default="http://localhost/books.toscrape.com/",
    help="Use with the bookworm command. The URL of books.toscrape.com on your local machine.")
@click.option(
    '--vmprof',
    is_flag=True,
    help="Profile the benchmark with vmprof and upload the result to the web.")
@click.option(
    '--set',
    '-s',
    multiple=True,
    help="Settings to be passed to the Scrapy command. Use with the bookworm/broadworm commands.")
@click.pass_context
def cli(ctx, n_runs, only_result, upload_result, book_url, vmprof, set):
    """A benchmark suite for Scrapy."""
    ctx.obj = CommandOption(n_runs, only_result, upload_result, book_url, vmprof, set)
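

# Example invocations (the group is chained, so sub-commands can be combined):
#
#     python bench.py --n-runs 3 --only_result bookworm
#     python bench.py --n-runs 2 cssbench xpathbench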


@cli.command()
@click.pass_obj
def bookworm(obj):
    """Spider to scrape a locally hosted site."""
scrapy_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'execute.py')
workpath = os.path.join(os.getcwd(), "books")
settings = " ".join("-s %s" % s for s in obj.set)
arg = "%s crawl followall -o items.csv -a book_url=%s %s" % (scrapy_path, obj.book_url, settings)
calculator(
"Book Spider",
arg,
obj.n_runs,
obj.only_result,
obj.upload_result,
obj.vmprof,
workpath)
os.remove(os.path.join(workpath, "items.csv"))
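

# Example with a Scrapy settings override (the value is illustrative):
#
#     python bench.py -s CONCURRENT_REQUESTS=32 bookworm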


@cli.command()
@click.pass_obj
def broadworm(obj):
    """Broad crawl spider to scrape locally hosted sites."""
scrapy_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'execute.py')
workpath = os.path.join(os.getcwd(), "broad")
settings = " ".join("-s %s" % s for s in obj.set)
arg = "%s crawl broadspider -o items.csv %s" % (scrapy_path, settings)
calculator(
"Broad Crawl",
arg,
obj.n_runs,
obj.only_result,
obj.upload_result,
obj.vmprof,
workpath)
os.remove(os.path.join(workpath, "items.csv"))


@cli.command()
@click.argument('csv_file')
@click.argument('column')
@click.argument('protocol', default=None)
@click.pass_obj
def csv(obj, csv_file, column, protocol):
    """Visit URLs from a CSV file.

    Loads the specified CSV file (1st argument) and yields a request for each
    URL in the specified column (2nd argument).

    If you specify a protocol (3rd argument, optional), such as http or https,
    column values are interpreted as domains, and the specified protocol is
    used to build request URLs.

    This benchmark can be used, for example, to visit a long list of popular
    internet domain names to see how Scrapy handles them. There are several
    webpages, such as https://www.domcop.com/top-10-million-domains, that allow
    downloading a CSV file with such a list.

    A global concurrency limit of 100 is used by default. However, if your CPU
    supports higher concurrency, you may want to override the
    CONCURRENT_REQUESTS setting with a higher value.
    """
scrapy_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'execute.py')
arguments = " ".join(
"-a '%s=%s'" % (key, value)
for key, value in (
('csv_file', csv_file),
('column', column),
('protocol', protocol),
)
if value is not None
)
settings = " ".join("-s '%s'" % s for s in obj.set)
arg = "%s runspider csvspider.py %s %s" % (scrapy_path, arguments, settings)
calculator(
"CSV Benchmark",
arg,
obj.n_runs,
obj.only_result,
obj.upload_result,
obj.vmprof
)
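

# Example (the CSV file and column names are hypothetical):
#
#     python bench.py csv top-domains.csv Domain https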


@cli.command()
@click.pass_obj
def linkextractor(obj):
    """Micro-benchmark for LinkExtractor()."""
arg = "link.py"
calculator(
"LinkExtractor",
arg,
obj.n_runs,
obj.only_result,
obj.upload_result,
obj.vmprof)


@cli.command()
@click.pass_obj
def cssbench(obj):
    """Micro-benchmark for extraction using CSS selectors."""
arg = "cssbench.py"
calculator(
"css Benchmark",
arg,
obj.n_runs,
obj.only_result,
obj.upload_result,
obj.vmprof)


@cli.command()
@click.pass_obj
def xpathbench(obj):
    """Micro-benchmark for extraction using XPath."""
arg = "xpathbench.py"
calculator(
"xpath Benchmark",
arg,
obj.n_runs,
obj.only_result,
obj.upload_result,
obj.vmprof)


@cli.command()
@click.pass_obj
def itemloader(obj):
    """Benchmark for item loaders."""
arg = "itemloader.py"
calculator(
"Item Loader benchmarker",
arg,
obj.n_runs,
obj.only_result,
obj.upload_result,
obj.vmprof)


@cli.command()
@click.pass_obj
def urlparseprofile(obj):
    """Benchmark for urlparse."""
arg = "urlparseprofile.py"
calculator(
"Urlparse benchmarker",
arg,
obj.n_runs,
obj.only_result,
obj.upload_result,
obj.vmprof)


if __name__ == '__main__':
    cli()