-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfeedgrep.py
126 lines (96 loc) · 3.37 KB
/
feedgrep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
r"""rawdog plugin to limit articles using regular expressions
Copyright 2005 Steve Atwell <atwell@uiuc.edu>
This rawdog plugin filters articles for a feed using Python's regular
expressions. Only articles containing a match to the regular expression
are kept. Both the title and the description are searched. It adds a
"grep" feed option with the following syntax:
grep [opts] regular expression
The following options are supported:
-i Perform case-insensitive matching.
-s Strip HTML tags and newlines. Tags and newlines are converted
to spaces, and multiple spaces are then condensed into a single
space.
-v Invert the sense of matching so that only articles not containing
a match are kept.
The regular expression should not be quoted. Any characters after the
options are considered part of the regular expression, although trailing
spaces are trimmed. Regular expressions that start with the "-"
character should start with "\-" instead.
Example Configuration:
feed 1h http://www.mysite.com/myfeed.rdf
grep -i dell monitor
feed 1h http://www.mysite.com/myfeed2.rdf
grep \b[Ii]nteresting\b|\bexciting\b
Limitations:
Only one regular expression can be specified per feed.
"""
import rawdoglib.rawdog
import rawdoglib.plugins
import re
# Plugin metadata: version, author, and the revision-control date keyword.
__version__ = "1.0"
__author__ = "Steve Atwell <atwell@uiuc.edu>"
__date__ = "$Date: 2005-01-22 21:07:56 -0600 (Sat, 22 Jan 2005) $"
class _RECache:
    """Memoize compiled regular expressions, keyed by (pattern, flags)."""

    def __init__(self):
        # Maps (pattern, flags) -> compiled pattern object.
        self._cache = {}

    def compile(self, pattern, flags=0):
        """Return the compiled form of pattern, compiling it on first use.

        Later calls with the same pattern and flags return the very same
        compiled object.  If compilation fails, the error propagates and
        nothing is stored.
        """
        key = (pattern, flags)
        if key not in self._cache:
            self._cache[key] = re.compile(pattern, flags)
        return self._cache[key]
# Shared cache of compiled grep patterns, so each (pattern, flags) pair is
# compiled only once no matter how many articles are checked.
cache = _RECache()
# Matches an HTML tag (non-greedy) or a newline; used when stripping text.
stripre = re.compile(r'<.*?>|\n')
# Matches a run of one or more spaces, so it can be condensed to one space.
spacere = re.compile(r' +')
def grep(rawdog, config, article, ignore):
    """Handle new articles using the article_seen hook.

    Looks up the "grep" option of the article's feed and, if present,
    keeps the article only when its title or summary matches the regular
    expression (or only when neither matches, if -v is given).

    Arguments:
    rawdog  -- object whose feeds[article.feed].args holds the feed options
    config  -- unused
    article -- the article; its feed and entry_info attributes are read
    ignore  -- object whose value attribute is set to True when the
               article should be dropped, False otherwise

    Returns True when the article is kept (including when the feed has no
    "grep" option) and False when it is ignored.

    Raises rawdoglib.rawdog.ConfigError if the option line has no regular
    expression or contains an unknown option letter.
    """
    ignore.value = False

    feedargs = rawdog.feeds[article.feed].args
    if "grep" not in feedargs:
        return True

    # Combine flags with bitwise OR, never "+": adding the same flag twice
    # (e.g. "-i -i") would turn it into a different flag.
    reflags = re.U | re.S
    invert = False
    strip = False

    # Parse options
    grepline = feedargs["grep"].strip()
    while grepline.startswith("-"):
        parts = grepline.split(None, 1)
        if len(parts) < 2:
            # Only options were given; nothing is left to use as the regex.
            raise rawdoglib.rawdog.ConfigError(
                "feedgrep: missing regex for feed %s" % (article.feed,))
        opt, grepline = parts

        for o in opt[1:]:
            if o == "i":
                reflags |= re.I
            elif o == "v":
                invert = True
            elif o == "s":
                strip = True
            else:
                raise rawdoglib.rawdog.ConfigError(
                    "feedgrep: bad option -%s for feed %s"
                    % (o, article.feed))

    if not grepline:
        # An empty option value carries no regular expression at all.
        raise rawdoglib.rawdog.ConfigError(
            "feedgrep: missing regex for feed %s" % (article.feed,))

    grepre = cache.compile(grepline, reflags)

    # Copy the text we will search so that we can modify
    # it if the strip option is set
    text = [article.entry_info[piece]
            for piece in ("title", "summary")
            if piece in article.entry_info]

    # Strip text.  First replace HTML tags and newlines with
    # spaces, and then condense multiple spaces into a
    # single space.
    if strip:
        text = [spacere.sub(' ', stripre.sub(' ', piece)) for piece in text]

    matched = False
    for piece in text:
        if grepre.search(piece):
            matched = True
            break

    # Keep on a match, or on no match when the sense is inverted.
    keep = matched != invert
    ignore.value = not keep

    # if we decided to ignore this, don't bother processing
    # it further
    return keep
# Register grep() as a handler for rawdog's "article_seen" hook.
rawdoglib.plugins.attach_hook("article_seen", grep)