URLHeadBear.py: Use robots.txt #2891

Open: wants to merge 1 commit into master
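
This change makes URLHeadBear consult each host's robots.txt before sending a HEAD request: a parsed robots file is cached per host, links that robots.txt disallows are skipped and marked with a new robots_allowed field, and InvalidLinkBear reports those links with NORMAL severity.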
bears/general/InvalidLinkBear.py (9 additions, 1 deletion)

@@ -42,7 +42,7 @@ def run(self, filename, file,
         :param follow_redirects: Set to true to autocorrect redirects.
         """
         for result in dependency_results.get(URLHeadBear.name, []):
-            line_number, link, code, context = result.contents
+            line_number, link, code, context, robots_allowed = result.contents
             if context is context.xml_namespace:
                 if code and 200 <= code < 300:
                     pass
@@ -54,6 +54,14 @@ def run(self, filename, file,
                         file=filename,
                         line=line_number,
                         severity=RESULT_SEVERITY.INFO)
+            elif not robots_allowed:
+                yield Result.from_values(
+                    origin=self,
+                    message=('robots.txt does not allow request to '
+                             '{url}').format(url=link),
+                    file=filename,
+                    line=line_number,
+                    severity=RESULT_SEVERITY.NORMAL)
             elif code is None:
                 yield Result.from_values(
                     origin=self,
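
As context for the unpacking above, here is a minimal sketch (not part of the PR) of the positional contract between the two bears: URLHeadBear publishes its findings as a plain list in HiddenResult.contents, and InvalidLinkBear destructures that list by position, which is why the new robots_allowed field has to be appended on both sides in the same commit. The values below are illustrative placeholders.

    # Illustrative placeholder values; the field order must match what
    # URLHeadResult.__init__ stores in self.contents (see the diff below).
    contents = [42, 'https://example.com/docs', 200, None, True]
    line_number, link, code, context, robots_allowed = contents

    # A link blocked by robots.txt: no HEAD request was made, so the
    # status code carries no meaning and robots_allowed is False.
    blocked = [7, 'https://example.com/private', None, None, False]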
bears/general/URLHeadBear.py (31 additions, 12 deletions)

@@ -11,6 +11,7 @@
 from coalib.settings.Setting import typed_dict
 from coala_utils.decorators import (enforce_signature, generate_ordering,
                                     generate_repr)
+from urllib import robotparser
 
 
 @generate_repr(('id', hex),
@@ -41,7 +42,8 @@ class URLHeadResult(HiddenResult):
     def __init__(self, origin, affected_code,
                  link: str,
                  head_response: (requests.models.Response, Exception),
-                 link_context: LINK_CONTEXT):
+                 link_context: LINK_CONTEXT,
+                 robots_allowed: bool):
 
         http_status_code = (head_response.status_code if
                             isinstance(head_response,
@@ -52,11 +54,12 @@ def __init__(self, origin, affected_code,
                         affected_code)
 
         self.contents = [affected_code[0].start.line, link, http_status_code,
-                         link_context]
+                         link_context, robots_allowed]
         self.link = link
         self.http_status_code = http_status_code
         self.link_context = link_context
         self.head_response = head_response
+        self.robots_allowed = robots_allowed
 
 
 class URLHeadBear(LocalBear):
@@ -88,6 +91,13 @@ def get_head_response(url, timeout):
         except requests.exceptions.RequestException as exc:
             return exc
 
+    @staticmethod
+    def get_robots_file(host):
+        rp = robotparser.RobotFileParser()
+        rp.set_url('https://' + host + '/robots.txt')
+        rp.read()
+        return rp
+
     @deprecate_settings(network_timeout=('timeout', lambda t: {'*': t}))
     def run(self, filename, file, dependency_results=dict(),
             network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT) = dict(),
@@ -117,15 +127,24 @@ def run(self, filename, file, dependency_results=dict(),
                            if not url == '*' else '*': timeout
                            for url, timeout in network_timeout.items()}
 
+        robots_dict = {}
+
         for result in dependency_results.get(URLBear.name, []):
             host = urlparse(result.link).netloc
-            head_resp = self.get_head_response(
-                result.link,
-                network_timeout.get(host)
-                if host in network_timeout
-                else network_timeout.get('*')
-                if '*' in network_timeout
-                else URLHeadBear.DEFAULT_TIMEOUT)
-
-            yield URLHeadResult(self, result.affected_code, result.link,
-                                head_resp, result.link_context)
+            if host not in robots_dict.keys():
+                robots_dict[host] = self.get_robots_file(host)
+            if robots_dict[host].can_fetch('*', result.link):
+                head_resp = self.get_head_response(
+                    result.link,
+                    network_timeout.get(host)
+                    if host in network_timeout
+                    else network_timeout.get('*')
+                    if '*' in network_timeout
+                    else URLHeadBear.DEFAULT_TIMEOUT)
+
+                yield URLHeadResult(self, result.affected_code, result.link,
+                                    head_resp, result.link_context, True)
+            else:
+                yield URLHeadResult(self, result.affected_code, result.link,
+                                    requests.models.Response(),
+                                    result.link_context, False)
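
For reference, a minimal self-contained sketch of the urllib.robotparser API that the new get_robots_file helper wraps; the host and path below are placeholders, not values from the PR.

    from urllib import robotparser

    rp = robotparser.RobotFileParser()
    rp.set_url('https://www.example.com/robots.txt')  # placeholder host
    rp.read()  # fetches and parses robots.txt; network errors can propagate

    # can_fetch(useragent, url) applies the parsed rules; '*' matches any
    # user agent, mirroring URLHeadBear's per-link call above.
    print(rp.can_fetch('*', 'https://www.example.com/some/page'))

Because robots_dict caches one parsed RobotFileParser per host, each host's robots.txt is fetched at most once per run, and links on disallowed paths are reported without a HEAD request ever being sent.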