diff --git a/bears/general/InvalidLinkBear.py b/bears/general/InvalidLinkBear.py
index dc81b5e729..a26eaa22e2 100644
--- a/bears/general/InvalidLinkBear.py
+++ b/bears/general/InvalidLinkBear.py
@@ -42,7 +42,7 @@ def run(self, filename, file,
         :param follow_redirects: Set to true to autocorrect redirects.
         """
         for result in dependency_results.get(URLHeadBear.name, []):
-            line_number, link, code, context = result.contents
+            line_number, link, code, context, robots_allowed = result.contents
             if context is context.xml_namespace:
                 if code and 200 <= code < 300:
                     pass
@@ -54,6 +54,14 @@ def run(self, filename, file,
                         file=filename,
                         line=line_number,
                         severity=RESULT_SEVERITY.INFO)
+            elif not robots_allowed:
+                yield Result.from_values(
+                    origin=self,
+                    message=('robots.txt does not allow request to '
+                             '{url}').format(url=link),
+                    file=filename,
+                    line=line_number,
+                    severity=RESULT_SEVERITY.NORMAL)
             elif code is None:
                 yield Result.from_values(
                     origin=self,
diff --git a/bears/general/URLHeadBear.py b/bears/general/URLHeadBear.py
index 9d5497c3f9..55c9f42307 100644
--- a/bears/general/URLHeadBear.py
+++ b/bears/general/URLHeadBear.py
@@ -11,6 +11,7 @@
 from coalib.settings.Setting import typed_dict
 from coala_utils.decorators import (enforce_signature, generate_ordering,
                                     generate_repr)
+from urllib import robotparser
 
 
 @generate_repr(('id', hex),
@@ -41,7 +42,8 @@ class URLHeadResult(HiddenResult):
     def __init__(self, origin, affected_code,
                  link: str,
                  head_response: (requests.models.Response, Exception),
-                 link_context: LINK_CONTEXT):
+                 link_context: LINK_CONTEXT,
+                 robots_allowed: bool):
 
         http_status_code = (head_response.status_code
                             if isinstance(head_response,
@@ -52,11 +54,12 @@ def __init__(self, origin, affected_code,
                               affected_code)
         self.contents = [affected_code[0].start.line, link,
                          http_status_code,
-                         link_context]
+                         link_context, robots_allowed]
         self.link = link
         self.http_status_code = http_status_code
         self.link_context = link_context
         self.head_response = head_response
+        self.robots_allowed = robots_allowed
 
 
 class URLHeadBear(LocalBear):
@@ -88,6 +91,13 @@ def get_head_response(url, timeout):
         except requests.exceptions.RequestException as exc:
             return exc
 
+    @staticmethod
+    def get_robots_file(host):
+        rp = robotparser.RobotFileParser()
+        rp.set_url('https://' + host + '/robots.txt')
+        rp.read()
+        return rp
+
     @deprecate_settings(network_timeout=('timeout', lambda t: {'*': t}))
     def run(self, filename, file, dependency_results=dict(),
             network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT) = dict(),
@@ -117,15 +127,24 @@ def run(self, filename, file, dependency_results=dict(),
                            if not url == '*' else '*': timeout
                            for url, timeout in network_timeout.items()}
 
+        robots_dict = {}
+
         for result in dependency_results.get(URLBear.name, []):
             host = urlparse(result.link).netloc
-            head_resp = self.get_head_response(
-                result.link,
-                network_timeout.get(host)
-                if host in network_timeout
-                else network_timeout.get('*')
-                if '*' in network_timeout
-                else URLHeadBear.DEFAULT_TIMEOUT)
-
-            yield URLHeadResult(self, result.affected_code, result.link,
-                                head_resp, result.link_context)
+            if host not in robots_dict.keys():
+                robots_dict[host] = self.get_robots_file(host)
+            if robots_dict[host].can_fetch('*', result.link):
+                head_resp = self.get_head_response(
+                    result.link,
+                    network_timeout.get(host)
+                    if host in network_timeout
+                    else network_timeout.get('*')
+                    if '*' in network_timeout
+                    else URLHeadBear.DEFAULT_TIMEOUT)
+
+                yield URLHeadResult(self, result.affected_code, result.link,
+                                    head_resp, result.link_context, True)
+            else:
+                yield URLHeadResult(self, result.affected_code, result.link,
+                                    requests.models.Response(),
+                                    result.link_context, False)
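
For context, a minimal standalone sketch of the technique the patch relies on: fetch a host's robots.txt once via urllib.robotparser, cache the parser per host, and consult can_fetch() before issuing any request. The helper name is_allowed, the robots_cache dict and the example URL are illustrative only and are not part of the patch.

# Sketch of the per-host robots.txt caching and can_fetch() check
# used by URLHeadBear.run() above; names and URL are placeholders.
from urllib import robotparser
from urllib.parse import urlparse

robots_cache = {}

def is_allowed(link, user_agent='*'):
    """Return True if robots.txt of the link's host permits fetching it."""
    host = urlparse(link).netloc
    if host not in robots_cache:
        rp = robotparser.RobotFileParser()
        rp.set_url('https://' + host + '/robots.txt')
        rp.read()                      # performs the HTTP request
        robots_cache[host] = rp
    return robots_cache[host].can_fetch(user_agent, link)

print(is_allowed('https://example.com/some/page'))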