karpathy · Dabbrivia · Jan 26, 2020 · Jan 27, 2020 · Feb 10, 2020 · Feb 10, 2020
diff --git a/OAI_seed_db.py b/OAI_seed_db.py
@@ -0,0 +1,110 @@
+"""
+Queries arxiv OAI and downloads paper XML data.
+
+This script was adapted from another arXiv-metadata project on GitHub. I
+should cite them here, but I need to find the URL again.
+"""
+
+import os
+import time
+import datetime
+import dateutil
+import pickle
+import random
+import argparse
+import urllib.request
+import re
+import requests
+
+from utils import Config, safe_pickle_dump
+from lxml import etree, objectify
+from parse_OAI_XML import parse_xml
+
+if __name__ == "__main__":
+
+  # parse input arguments
+  parser = argparse.ArgumentParser()
+  parser.add_argument('--set', type=str,
+                      default='physics:cond-mat',
+                      #default='physics:hep-th',
+                      help='category used for arxiv OAI of form physics:arxivcat')
+  parser.add_argument('--from-date', type=str, default=datetime.date.isoformat(datetime.date.today()-datetime.timedelta(1)), help='Start date in YYYY-MM-DD')
+  parser.add_argument('--until-date', type=str, default=datetime.date.isoformat(datetime.date.today()), help='End date in YYYY-MM-DD, default is today')
+  args = parser.parse_args()
+
+  # misc hardcoded variables
+  resume_re = re.compile(r".*<resumptionToken.*?>(.*?)</resumptionToken>.*")
+  base_url = 'http://export.arxiv.org/oai2?' # base api query url
+  req = {u"verb": "ListRecords",
+           u"metadataPrefix": u"arXivRaw", u"set": args.set, u"from": args.from_date, u"until": args.until_date,}
+  print('Searching arXiv with query: '+str(req))
+
+  max_tries = 10
+
+  num_added_total = 0
+  failures = 0
+  count = 0
+  while True:
+    # Send the request.
+    r = requests.post(base_url, data=req)
+
+    # Handle the response.
+    code = r.status_code
+    print("Received Response Code:", code)
+
+    if code == 503:
+        # Asked to retry: give up before sleeping once the retry budget is spent.
+        failures += 1
+        if failures >= max_tries:
+            print(u"Failed too many times...")
+            break
+        # Fall back to the regular 20 second pause when the header is missing.
+        to = int(r.headers.get("retry-after", 20))
+        print(u"Got 503. Retrying after {0:d} seconds.".format(to))
+        time.sleep(to)
+
+    elif code == 200:
+        failures = 0
+
+        # Keep the raw XML on disk so that a parsing mistake does not require
+        # querying arXiv again; an explicit UTF-8 encoding keeps non-ASCII
+        # metadata writable whatever the locale's default encoding is.
+        content = r.text
+        count += 1
+        file_name = u"raw"+datetime.date.isoformat(datetime.date.today())+"-{0:08d}.xml".format(count)
+        print(u"Writing to: {0}".format(file_name))
+        with open(file_name, u"w", encoding="utf-8") as f:
+            f.write(content)
+
+        #Call parse_xml (imported from parse_OAI_XML) to convert OAI-RAW to API format
+        parse_xml(file_name)
+        #num_added_total += num_added
+
+        # Look for a resumption token.
+        token = resume_re.search(content)
+        if token is None:
+            break
+        token = token.groups()[0]
+
+        # If there isn't one, we're all done.
+        if token == "":
+            print(u"All done.")
+            break
+
+        print(u"Resumption token: {0}.".format(token))
+
+        # If there is a resumption token, rebuild the request.
+        req = {u"verb": u"ListRecords",
+               u"resumptionToken": token}
+
+        # Pause so as not to get banned.
+        to = 20
+        print(u"Sleeping for {0:d} seconds so as not to get banned."
+                .format(to))
+        time.sleep(to)
+
+    else:
+        # Raises for 4xx/5xx; any other status would otherwise be re-requested forever.
+        r.raise_for_status()
+        raise RuntimeError(u"Unexpected response code: {0}".format(code))
+
diff --git a/README.md b/README.md
@@ -44,6 +44,13 @@ The processing pipeline requires you to run a series of scripts, and at this sta
 
 Optionally you can also run the `twitter_daemon.py` in a screen session, which uses your Twitter API credentials (stored in `twitter.txt`) to query Twitter periodically looking for mentions of papers in the database, and writes the results to the pickle file `twitter.p`.
 
+Structure of the `twitter.txt`:
+<pre>consumer_key
+consumer_secret
+access_token_key
+access_token_secret
+</pre>
+
 I have a simple shell script that runs these commands one by one, and every day I run this script to fetch new papers, incorporate them into the database, and recompute all tfidf vectors/classifiers. More details on this process below.
 
 **protip: numpy/BLAS**: The script `analyze.py` does quite a lot of heavy lifting with numpy. I recommend that you carefully set up your numpy to use BLAS (e.g. OpenBLAS), otherwise the computations will take a long time. With ~25,000 papers and ~5000 users the script runs in several hours on my current machine with a BLAS-linked numpy.
@@ -52,7 +59,7 @@ I have a simple shell script that runs these commands one by one, and every day
 
 If you'd like to run the flask server online (e.g. AWS) run it as `python serve.py --prod`.
 
-You also want to create a `secret_key.txt` file and fill it with random text (see top of `serve.py`).
+You also want to create a `secret_key.txt` file and fill it with random text (see top of `serve.py`), for example with `cat /dev/urandom | base64 | head -c 1000 > secret_key.txt`.
 
 ### Current workflow
 
@@ -67,6 +74,8 @@ python analyze.py
 python buildsvm.py
 python make_cache.py
 ```
+### Crontab entry
+```21 04 * * * . /home/ubuntu/.profile; echo "START $(date)">>/data/daily_update.log; /home/ubuntu/arxiv-sanity-preserver/daily_update.sh 2>&1 1>>/data/daily_update.log; echo "FINISH $(date)">>/data/daily_update.log; ```
 
 I run the server in a screen session, so `screen -S serve` to create it (or `-r` to reattach to it) and run:
 

diff --git a/analyze.py b/analyze.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
 """
 Reads txt files of all papers and computes tfidf vectors for all papers.
 Dumps results to file tfidf.p
@@ -9,50 +11,69 @@
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 
-from utils import Config, safe_pickle_dump
+from utils import Config, safe_pickle_dump, dir_basename_from_pid
+from joblib import Parallel, delayed
+
+import multiprocessing
+import pandas as pd
+import numpy as np
+from multiprocessing import Pool
+import scipy.sparse as sp
+
+import regex
 
 seed(1337)
-max_train = 5000 # max number of tfidf training documents (chosen randomly), for memory efficiency
+max_train = 25000 # max number of tfidf training documents (chosen randomly), for memory efficiency
 max_features = 5000
 
 # read database
 db = pickle.load(open(Config.db_path, 'rb'))
 
 # read all text files for all papers into memory
+
+def read_txt_path(p):
+  """Return the text of file `p`, or "" when its bytes cannot be decoded."""
+  with open(p, 'r') as f:
+    try: # some files are not valid text in the default encoding
+      return f.read()
+    except UnicodeDecodeError: # narrow catch so Ctrl-C and I/O errors still surface
+      return ""
+
 txt_paths, pids = [], []
 n = 0
 for pid,j in db.items():
   n += 1
   idvv = '%sv%d' % (j['_rawid'], j['_version'])
-  txt_path = os.path.join('data', 'txt', idvv) + '.pdf.txt'
+
+  txt_path = os.path.join(Config.txt_dir, dir_basename_from_pid(pid,j)+".txt")
+
   if os.path.isfile(txt_path): # some pdfs dont translate to txt
-    with open(txt_path, 'r') as f:
-      txt = f.read()
+    txt = read_txt_path(txt_path)
+
     if len(txt) > 1000 and len(txt) < 500000: # 500K is VERY conservative upper bound
       txt_paths.append(txt_path) # todo later: maybe filter or something some of them
       pids.append(idvv)
-      print("read %d/%d (%s) with %d chars" % (n, len(db), idvv, len(txt)))
+      #print("read %d/%d (%s) with %d chars" % (n, len(db), idvv, len(txt)))
     else:
       print("skipped %d/%d (%s) with %d chars: suspicious!" % (n, len(db), idvv, len(txt)))
+      pass
   else:
     print("could not find %s in txt folder." % (txt_path, ))
 print("in total read in %d text files out of %d db entries." % (len(txt_paths), len(db)))
 
 # compute tfidf vectors with scikits
-v = TfidfVectorizer(input='content', 
-        encoding='utf-8', decode_error='replace', strip_accents='unicode', 
-        lowercase=True, analyzer='word', stop_words='english', 
+v = TfidfVectorizer(input='content',
+        encoding='utf-8', decode_error='replace', strip_accents='unicode',
+        lowercase=True, analyzer='word', stop_words='english',
         token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
-        ngram_range=(1, 2), max_features = max_features, 
+        ngram_range=(1, 2), max_features = max_features,
         norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
         max_df=1.0, min_df=1)
 
 # create an iterator object to conserve memory
 def make_corpus(paths):
   for p in paths:
-    with open(p, 'r') as f:
-      txt = f.read()
-    yield txt
+    yield read_txt_path(p)      
 
 # train
 train_txt_paths = list(txt_paths) # duplicate
@@ -62,12 +83,39 @@ def make_corpus(paths):
 train_corpus = make_corpus(train_txt_paths)
 v.fit(train_corpus)
 
+# export the first 1000 characters of each text for topic modelling
+corpus = make_corpus(txt_paths) # fresh generator: a make_corpus result can only be consumed once
+pattern = regex.compile(r'((?=[^!?.,\ ])\W|\d)+', regex.UNICODE) # raw string: same pattern, no invalid-escape warning
+clean_txt=(pattern.sub(' ',str(text)[:1000]) for text in corpus)
+texts_df=pd.DataFrame(clean_txt, columns=['Text',])
+texts_df.to_excel('diego_texts.xlsx',index=True)
+del corpus
+
+# https://github.com/rafaelvalero/ParallelTextProcessing/blob/master/parallelizing_text_processing.ipynb
+num_cores = multiprocessing.cpu_count()
+num_partitions = num_cores-1 if num_cores > 1 else 1 # I like to leave some cores for other processes
+print('num_partitions',num_partitions)
+
+#TODO we actually don't need a dataframe, transform corpus to np.array directly  
+def parallelize_dataframe(df, func):
+    """Apply `func` to `num_partitions` chunks of `df` in worker processes and stack the results as one CSR matrix."""
+    chunks = np.array_split(df, num_partitions)
+    del df # drops only this function's reference to the frame
+    # the context manager shuts the pool down even when a worker raises
+    with Pool(num_partitions) as pool:
+        parts = pool.map(func, chunks)
+    return sp.vstack(parts, format='csr')
+
+def transform_func(data):
+    """Return the tf-idf matrix for the "text" column of `data`, using the module-level vectorizer `v`."""
+    return v.transform(data["text"])
+
 # transform
 print("transforming %d documents..." % (len(txt_paths), ))
 corpus = make_corpus(txt_paths)
-X = v.transform(corpus)
-print(v.vocabulary_)
-print(X.shape)
+data_pd = pd.DataFrame(corpus)
+data_pd.rename(columns = {0:'text'},inplace = True)
+X = parallelize_dataframe(data_pd, transform_func)
 
 # write full matrix out
 out = {}
@@ -83,12 +131,10 @@ def make_corpus(paths):
 out['ptoi'] = { x:i for i,x in enumerate(pids) } # pid to ix in X mapping
 print("writing", Config.meta_path)
 safe_pickle_dump(out, Config.meta_path)
+del out
+del data_pd
 
-print("precomputing nearest neighbor queries in batches...")
-X = X.todense() # originally it's a sparse matrix
-sim_dict = {}
-batch_size = 200
-for i in range(0,len(pids),batch_size):
+def compute_batch(i):
   i1 = min(len(pids), i+batch_size)
   xquery = X[i:i1] # BxD
   ds = -np.asarray(np.dot(X, xquery.T)) #NxD * DxB => NxB
@@ -97,5 +143,13 @@ def make_corpus(paths):
     sim_dict[pids[i+j]] = [pids[q] for q in list(IX[:50,j])]
   print('%d/%d...' % (i, len(pids)))
 
+
+print("precomputing nearest neighbor queries in batches...")
+X = X.todense().astype(np.float32) # originally it's a sparse matrix
+sim_dict = {}
+batch_size = 200
+Parallel( n_jobs=-1, prefer="threads", verbose=5)(
+    delayed(compute_batch)(i) for i in range(0,len(pids),batch_size))
+
 print("writing", Config.sim_path)
 safe_pickle_dump(sim_dict, Config.sim_path)
diff --git a/dabbrivia_list_db.py b/dabbrivia_list_db.py
@@ -0,0 +1,21 @@
+"""
+Reads the paper database and prints the relative pdf path of every paper.
+"""
+import os
+import pickle
+
+from utils import Config, safe_pickle_dump
+
+# read database
+with open(Config.db_path, 'rb') as f: # close the file as soon as the pickle is loaded
+  db = pickle.load(f)
+# print the relative pdf path of every paper in the database
+for j in db.values():
+  rawid = j['_rawid']
+  if rawid[:4].isdigit() and '.' in rawid: # four leading digits and a dot
+      print(rawid[:4]+'/'+rawid+'.pdf')
+  elif '/' in rawid: # "<prefix>/<number>": the directory comes from the number part
+      print(rawid.split("/")[1][:4]+'/'+"".join(rawid.split("/"))+'.pdf')
+  else: # otherwise prefix the id with the first part of the primary category term
+      print(rawid[:4]+'/'+j['arxiv_primary_category']['term'].split(".")[0]+rawid+'.pdf')
+