-
Notifications
You must be signed in to change notification settings - Fork 13
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding edges() and iteredges() Functions for DAWGs #1
base: master
Are you sure you want to change the base?
Changes from 8 commits
926d6e8
fa6cd76
8e7390a
0211c19
30bf53b
15355be
dee560c
2a93173
c94b4d8
8cb08f3
f3baac8
77f3802
ae7472a
4975f07
1207380
5462916
0b81a9f
2cbd340
f56e2b9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,29 +17,29 @@ def __init__(self): | |
"Root index" | ||
|
||
def has_value(self, index): | ||
"Checks if a given index is related to the end of a key." | ||
#Checks if a given index is related to the end of a key. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why are comments better than docstrings? It is nice to have some docs available at runtime, e.g. in the REPL. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fair. I changed it because PEP8 checks were complaining, and I never use docs at runtime, so forgot about that. I'm pretty neutral on this, so I'll change them back. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think using triple quotes and replacing "checks/gets/reads/..." with "check/get/read" should make them pep8-compatible. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The code currently isn't pep8 compatible anyway because some older lines go over 80 chars, etc... I'm thinking let's leave this as is for now, and I'll do a later run-through to get everything pep8 compatible so the whole repo passes pep8 compatibility checks. |
||
return units.has_leaf(self._units[index]) | ||
|
||
def value(self, index):
    """Get a value from a given index.

    The unit at ``index`` supplies an offset; the value is read from the
    unit at ``(index ^ offset) & units.PRECISION_MASK``.
    """
    offset = units.offset(self._units[index])
    value_index = (index ^ offset) & units.PRECISION_MASK
    return units.value(self._units[value_index])
|
||
def read(self, fp):
    """Read a dictionary from an input stream.

    The stream starts with a 4-byte unsigned integer (``struct`` format
    ``"=I"``) giving the number of units, followed by the units
    themselves, which are loaded with ``self._units.fromfile``.
    """
    # str() keeps the format a native string for struct.unpack.
    base_size = struct.unpack(str("=I"), fp.read(4))[0]
    self._units.fromfile(fp, base_size)
|
||
def contains(self, key):
    """Exact matching.

    Follow ``key`` from the root and report whether the index reached has
    a value.  Returns False when the key cannot be followed at all.
    """
    index = self.follow_bytes(key, self.ROOT)
    if index is None:
        return False
    return self.has_value(index)
|
||
def find(self, key): | ||
"Exact matching (returns value)" | ||
#Exact matching (returns value) | ||
index = self.follow_bytes(key, self.ROOT) | ||
if index is None: | ||
return -1 | ||
|
@@ -48,7 +48,7 @@ def find(self, key): | |
return self.value(index) | ||
|
||
def follow_char(self, label, index): | ||
"Follows a transition" | ||
#Follows a transition | ||
offset = units.offset(self._units[index]) | ||
next_index = (index ^ offset ^ label) & units.PRECISION_MASK | ||
|
||
|
@@ -58,7 +58,7 @@ def follow_char(self, label, index): | |
return next_index | ||
|
||
def follow_bytes(self, s, index): | ||
"Follows transitions." | ||
#Follows transitions. | ||
for ch in s: | ||
index = self.follow_char(int_from_byte(ch), index) | ||
if index is None: | ||
|
@@ -95,16 +95,96 @@ def size(self): | |
return len(self._units) | ||
|
||
|
||
class EdgeFollower(object): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 👍 for separating Completer and EdgeFollower |
||
def __init__(self, dic=None, guide=None): | ||
self._dic = dic | ||
self._guide = guide | ||
|
||
def value(self):
    """Return the value at the current index, or False if it has none.

    The result is whatever ``self._dic.value`` returns for the current
    index, not a list.
    """
    if self._dic.has_value(self._cur_index):
        return self._dic.value(self._cur_index)
    return False
|
||
def start(self, index, prefix=b""):
    """Position the follower on the first child edge of ``index``.

    ``prefix`` seeds ``self.key``.  If the guide reports a first child for
    ``index`` and the dictionary has that transition, the cursor is placed
    on it and True is returned.  Otherwise the cursor stays unset, so a
    later ``next()`` returns False, and False is returned here.
    """
    self.key = bytearray(prefix)
    self.base_key_len = len(self.key)
    self._parent_index = index
    self._sib_index = None
    self._cur_index = None

    if not self._guide.size():
        return False

    child_label = self._guide.child(index)  # UCharType
    if not child_label:
        return False

    # Follow the transition to the first child.
    next_index = self._dic.follow_char(child_label, index)
    # It is the transition result that can be missing; ``index`` is the
    # parent supplied by the caller.
    if next_index is None:
        return False

    self._sib_index = next_index
    self._cur_index = next_index
    self.key.append(child_label)
    # NOTE(review): this raises UnicodeDecodeError when the first child is
    # the lead byte of a multi-byte UTF-8 sequence; next() handles that
    # case but start() does not -- confirm the intended behaviour.
    self.decoded_key = self.key.decode('utf8')
    return True
|
||
def next(self):
    """Advance to the next sibling edge (not necessarily a terminal).

    Returns True when the cursor sits on an edge whose key decodes as
    UTF-8 (stored in ``self.decoded_key``).  Returns False when there is
    no current sibling, no further sibling, or a multi-byte character
    cannot be completed.
    """
    if not self._sib_index:
        return False

    sibling_label = self._guide.sibling(self._sib_index)
    self._sib_index = self._dic.follow_char(sibling_label, self._parent_index)
    self._cur_index = self._sib_index
    if not self._sib_index:
        return False

    # Drop whatever the previous edge appended, then add this sibling.
    self.key = self.key[:self.base_key_len]
    self.key.append(sibling_label)

    while True:
        try:
            self.decoded_key = self.key.decode('utf8')
        except UnicodeDecodeError:
            # The key ends inside a multi-byte character; keep following
            # children until it becomes decodable.
            pass
        else:
            return True

        # Ask for the child of the node we are currently on, so every
        # continuation byte comes from the node reached by the last one.
        child_label = self._guide.child(self._cur_index)
        self._cur_index = self._dic.follow_char(child_label, self._cur_index)
        if not self._cur_index:
            return False
        self.key.append(child_label)
|
||
def get_cur_edge(self):
    """Return ``(decoded_key, has_value)`` for the current edge.

    The second item is the result of ``self._dic.has_value`` for the
    current index, i.e. whether the edge is a terminal.
    """
    return (self.decoded_key, self._dic.has_value(self._cur_index))
|
||
|
||
class Completer(object): | ||
|
||
def __init__(self, dic=None, guide=None): | ||
self._dic = dic | ||
self._guide = guide | ||
|
||
def value(self):
    """Return the dictionary value at the last index.

    The result is whatever ``self._dic.value`` returns for
    ``self._last_index``, not a list.
    """
    return self._dic.value(self._last_index)
|
||
def start(self, index, prefix=b""): | ||
"initial setup for a completer next() action on some prefix" | ||
|
||
self.key = bytearray(prefix) | ||
|
||
if self._guide.size(): | ||
|
@@ -113,7 +193,6 @@ def start(self, index, prefix=b""): | |
else: | ||
self._index_stack = [] | ||
|
||
|
||
def next(self): | ||
"Gets the next key" | ||
|
||
|
@@ -153,7 +232,6 @@ def next(self): | |
|
||
return self._find_terminal(index) | ||
|
||
|
||
def _follow(self, label, index): | ||
next_index = self._dic.follow_char(label, index) | ||
if next_index is None: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think that .edges method should return the same data regardless of DAWG class. If it returns a list of strings in a base class it should return a list of strings in all subclasses.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For BytesDAWG it could make sense to filter out edges leading to the values.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's similar data for all. It never returns a list of strings. It always returns a list of 2-tuples. For dawgs with no data, the tuples are
(str, True)
for terminal edges and (str, False)
for non-terminals. For dawgs with data, they're
(str, data)
for terminal edges, and (str, False)
for non-terminals. Since data evaluates to true in a boolean situation, this seems most logical to me. If you want the data in an edge, you have it. If you want to just use the edges and know whether they're terminals or not, you can do that the same way across dawgs. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we really want them to be the same, we could make them return
(str, True)
for terminal edges always, and just add an extra edges_with_data()
method for dawgs that provide any kind of data storage. That actually seems most consistent to me. If you agree, I'll make that addition.