# corpusLibOverwrites.py (62 lines, 2.25 KB) — monkeypatch overrides for tmtoolkit's Corpus class
#Library to be overwritten
from tmtoolkit.corpus import Corpus
#Used to make overwritten update method. needed for accurate progress bar
from typing import Dict, List, Union, Sequence, Iterator
from spacy.tokens import Doc
from tmtoolkit.corpus._document import Document
import logging
#To be overwritten to allow for output of progress
def textOutput(s: str):
    """No-op progress hook.

    Intended to be replaced (overwritten) by the host application with a
    callable that displays the status string *s* to the user.
    """
    return None
#copied from tmtoolkit.corpus._corpus.py: reuse the library's own logger name
#so debug output from the overridden methods integrates with tmtoolkit's logging config
logger = logging.getLogger('tmtoolkit')
#used to overwrite tmtoolkit.corpus._corpus.py to add progressbar
def update(self, new_docs: Union[Dict[str, Union[str, Doc, Document]], Sequence[Document]]):
    """Replacement for ``Corpus.update`` that reports per-document progress.

    Accepts either a mapping of label -> raw text / SpaCy ``Doc`` /
    tmtoolkit ``Document``, or a plain sequence of ``Document`` objects.
    Emits a ``textOutput`` status line for each document loaded, then for
    the bimap and worker bookkeeping steps.

    Raises ``ValueError`` when a non-string entry is neither a SpaCy ``Doc``
    nor a tmtoolkit ``Document``.
    """
    # normalize: a sequence of Documents becomes a label -> document mapping
    if isinstance(new_docs, Sequence):
        new_docs = {doc.label: doc for doc in new_docs}
    logger.debug(f'updating Corpus instance with {len(new_docs)} new documents')
    raw_texts = {}
    for label, doc in new_docs.items():
        textOutput("Loading " + label)
        if isinstance(doc, str):
            # raw text — defer to the batch initializer below
            raw_texts[label] = doc
            continue
        if isinstance(doc, Doc):
            # wrap a bare SpaCy Doc in a tmtoolkit Document
            doc = self._init_document(doc, label=label)
        elif not isinstance(doc, Document):
            raise ValueError('one or more documents in `new_docs` are neither raw text documents, nor SpaCy '
                             'documents nor tmtoolkit Documents')
        self._docs[label] = doc
    if raw_texts:
        self._init_docs(raw_texts)
    textOutput("Updating bimap")
    self._update_bimaps(new_docs.keys())
    textOutput("Updating workers")
    self._update_workers_docs()
def _nlppipe(self, docs):
    """
    Helper method to set up the SpaCy pipeline.

    Overwrites ``Corpus._nlppipe`` so the serial branch reports per-document
    progress via ``textOutput``. The parallel branch returns SpaCy's lazy
    pipe iterator unchanged; the serial branch returns a fully materialized
    list of ``Doc`` objects.

    NOTE(review): in the serial branch ``len(docs)`` is called, so *docs*
    must be sized (list/dict view), not a bare generator — matches how the
    library calls it, but confirm against tmtoolkit internals.
    """
    if self.max_workers > 1:  # pipeline for parallel processing
        logger.debug(f'using parallel processing NLP pipeline with {self.max_workers} workers')
        textOutput(f'using parallel processing NLP pipeline with {self.max_workers} workers')
        return self.nlp.pipe(docs, n_process=self.max_workers)
    else:  # serial processing
        logger.debug('using serial processing NLP pipeline')
        total = str(len(docs))
        # fix: the accumulator is a list, not an Iterator — annotate it as such
        ret: List[Doc] = []
        # fix: count from 1 so the progress message reads "1 of N" .. "N of N"
        # (the original used len(ret), which reported a 0-based "0 of N")
        for i, txt in enumerate(docs, start=1):
            textOutput("Running NLP Pipeline on document " + str(i) + " of " + total)
            ret.append(self.nlp(txt))
        return ret
# Monkeypatch: install the progress-reporting replacements on the Corpus class
# so all subsequently created Corpus instances use them.
Corpus.update = update
Corpus._nlppipe = _nlppipe