From 7772bdd8c10891a8ec4ab90a16bd0805c2afe93d Mon Sep 17 00:00:00 2001
From: Jon Saad-Falcon
Date: Mon, 8 Jun 2020 15:04:14 -0400
Subject: [PATCH] Removed server.py, which is unnecessary for current release

---
 server.py | 620 ------------------------------------------------------
 1 file changed, 620 deletions(-)
 delete mode 100644 server.py

diff --git a/server.py b/server.py
deleted file mode 100644
index a985022..0000000
--- a/server.py
+++ /dev/null
@@ -1,620 +0,0 @@
-from flask import Flask, request, jsonify
-from flask_cors import CORS
-import pandas as pd
-import numpy as np
-from sklearn.feature_extraction.text import TfidfVectorizer
-import random
-# random.seed() returns None, so keep the seed value itself for reuse below
-starting_seed = 101
-random.seed(starting_seed)
-
-# NLTK
-import nltk
-from nltk.tokenize import RegexpTokenizer
-from nltk.stem.snowball import SnowballStemmer
-from nltk.corpus import stopwords
-import re
-nltk.download('stopwords')
-
-from sklearn.cluster import KMeans
-from sklearn.mixture import GaussianMixture
-from scipy.spatial.distance import cdist
-from sklearn.decomposition import PCA
-from sklearn.preprocessing import normalize
-from sklearn.metrics import pairwise_distances
-from scipy.stats import multivariate_normal as mvn
-
-
-def cleanData(df, addKeywords, amountOfKeywords):
-
-    # Perform a quick count of the number of unique keywords
-    set_of_keywords = set()
-    for element in df.Keywords:
-        if type(element) == str:
-            for element2 in element.split("/"):
-                set_of_keywords.add(element2)
-
-    # Remove all numerals and turn everything into lowercase.
-    # Only titles and abstracts are used right now.
-
-    # If addKeywords is true, append each researcher's keywords amountOfKeywords times
-    if addKeywords:
-        df['paper_text_tokens'] = df.Title.map(lambda x: re.sub(r'\d+', '', x)) + ' ' + df.Abstract.map(lambda x: re.sub(r'\d+', '', x))
-
-        for i in range(0, amountOfKeywords + 1):
-            df['paper_text_tokens'] = df['paper_text_tokens'] + ' ' + df.Keywords.map(lambda x: re.sub(r'\d+', '', str(x)))
-    else:
-        df['paper_text_tokens'] = df.Title.map(lambda x: re.sub(r'\d+', '', x)) + ' ' + df.Abstract.map(lambda x: re.sub(r'\d+', '', x))
-
-    # Replace the / separators in keywords with spaces
-    df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x: re.sub('/', ' ', x))
-
-    # Lower case
-    df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x: x.lower())
-
-    # Remove HTML tags
-    TAG_RE = re.compile(r'<[^>]+>')
-    def remove_tags(text):
-        return TAG_RE.sub('', text)
-    df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x: remove_tags(x))
-
-    # Trim down abstracts that repeat themselves
-    df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x: x[0:1250])
-
-    # Tokenize the text
-    df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x: RegexpTokenizer(r'\w+').tokenize(x))
-
-    # Stem the tokens to simplify the processing
-    snowball = SnowballStemmer("english")
-    df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x: [snowball.stem(token) for token in x])
-
-    # Remove all stop words to simplify processing
-    stop_en = stopwords.words('english')
-    df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x: [t for t in x if t not in stop_en])
-
-    # Remove any extremely short words that could bias the processing
-    df['paper_text_tokens'] = df.paper_text_tokens.map(lambda x: [t for t in x if len(t) > 2])
-
-    # Recombine the words to form a clean document per paper
-    df['paper_text_tokens'] = df['paper_text_tokens'].str.join(" ")
-
-    # Concatenate all of an author's titles and abstracts together, and collect
-    # research areas and names
-    list_of_authors_works = []
-    list_of_authors_research_areas = []
-    list_of_authors_names = []
-    list_of_authors_info = []
-    i = 0
-    while i < len(df['paper_text_tokens']):
-        next_string = ''
-        current_author = df.Author[i]
-        count = 0
-        list_of_authors_research_areas.append(df.ResearchArea[i])
-        list_of_authors_names.append(df.Author[i])
-
-        next_info = {"Author": df.Author[i], "ResearchArea": df.ResearchArea[i].replace("_", " "), "URL": df.URL[i],
-                     "KeyWords": str(df.Keywords[i]).replace("/", ", ").replace("nan", ""),
-                     "Citations": df.Citations[i], "Affiliation": df.Affiliation[i].replace("'", "").replace("/", ",")}
-        list_of_authors_info.append(next_info)
-
-        while i + count < len(df['paper_text_tokens']) and current_author == df.Author[i + count]:
-            next_string = next_string + ' ' + df['paper_text_tokens'][i + count]
-            count = count + 1
-        list_of_authors_works.append(next_string)
-        i = i + count
-
-    # Assign this list to a pandas Series for analysis
-    authors_works = pd.Series(list_of_authors_works)
-
-    # Return the cleaned data set
-    return authors_works, list_of_authors_research_areas, list_of_authors_names, list_of_authors_info
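
For reference, the same cleaning steps can be traced on a single string. This is a minimal sketch that mirrors the per-paper processing in cleanData; it assumes the NLTK stopword list has already been downloaded, and the sample sentence is made up for illustration rather than taken from ML_clean.csv.

import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords   # assumes nltk.download('stopwords') has been run

def clean_text(text):
    # Mirror the cleaning done in cleanData, but on one string
    text = re.sub(r'\d+', '', text)        # drop numerals
    text = re.sub('/', ' ', text)          # split keyword separators
    text = text.lower()                    # lower case
    text = re.sub(r'<[^>]+>', '', text)    # strip HTML tags
    tokens = RegexpTokenizer(r'\w+').tokenize(text[0:1250])   # truncate long, repetitive abstracts
    snowball = SnowballStemmer("english")
    stop_en = set(stopwords.words('english'))
    tokens = [snowball.stem(t) for t in tokens]
    return " ".join(t for t in tokens if t not in stop_en and len(t) > 2)

print(clean_text("We propose 2 new <b>graph</b> embedding methods for visualization"))
# prints a lower-cased, stemmed version of the sentence with stop words and short words removed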
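
The per-author concatenation above could also be written with a pandas groupby. The sketch below assumes the same df and column names used inside cleanData, and it is equivalent to the while loop when each author's papers appear as one contiguous block of rows.

# Equivalent to the while loop in cleanData when an author's papers are adjacent in df;
# sort=False keeps authors in order of first appearance.
authors_works_alt = (df.groupby('Author', sort=False)['paper_text_tokens']
                       .apply(' '.join)
                       .reset_index(drop=True))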
-
-
-class GMM:
-    """ Gaussian Mixture Model
-
-    Parameters
-    -----------
-        C: int, number of Gaussian distributions (clusters)
-
-        n_runs: int, number of EM iterations to run
-
-        (the random seed is taken from the module-level starting_seed)
-
-    Attributes
-    -----------
-        mu, pi, sigma: fitted component means, weights and covariances
-
-        gamma: responsibility of each component for each data point
-
-    """
-    def __init__(self, C, n_runs):
-        self.C = C  # number of Gaussians/clusters
-        self.n_runs = n_runs
-        self.seed = starting_seed
-
-    def get_params(self):
-        return (self.mu, self.pi, self.sigma)
-
-    def calculate_mean_covariance(self, X, prediction):
-        """Calculate means and covariances of the different
-        clusters from a k-means prediction
-
-        Parameters:
-        ------------
-        prediction: cluster labels from k-means
-
-        X: N*d numpy array of data points
-
-        Returns:
-        -------------
-        initial_means: for E-step of EM algorithm
-
-        initial_cov: for E-step of EM algorithm
-
-        """
-        d = X.shape[1]
-        labels = np.unique(prediction)
-        self.initial_means = np.zeros((self.C, d))
-        self.initial_cov = np.zeros((self.C, d, d))
-        self.initial_pi = np.zeros(self.C)
-
-        counter = 0
-        for label in labels:
-            ids = np.where(prediction == label)  # returns indices
-            self.initial_pi[counter] = len(ids[0]) / X.shape[0]
-            self.initial_means[counter, :] = np.mean(X[ids], axis=0)
-            de_meaned = X[ids] - self.initial_means[counter, :]
-            Nk = X[ids].shape[0]  # number of data points in the current gaussian
-            self.initial_cov[counter, :, :] = np.dot(self.initial_pi[counter] * de_meaned.T, de_meaned) / Nk
-            counter += 1
-        # the mixture weights should sum to one (up to floating-point error)
-        assert np.isclose(np.sum(self.initial_pi), 1)
-
-        return (self.initial_means, self.initial_cov, self.initial_pi)
-
-    def _initialise_parameters(self, X):
-        """Run k-means to find starting parameter values.
-        https://datascience.stackexchange.com/questions/11487/how-do-i-obtain-the-weight-and-variance-of-a-k-means-cluster
-
-        Parameters:
-        ------------
-        X: numpy array of data points
-
-        Returns:
-        ----------
-        tuple containing the initial means, covariances and weights
-
-        _initial_means: numpy array: (C*d)
-
-        _initial_cov: numpy array: (C,d*d)
-
-        """
-        n_clusters = self.C
-        kmeans = KMeans(n_clusters=n_clusters, init="k-means++", max_iter=500, algorithm='auto')
-        fitted = kmeans.fit(X)
-        prediction = kmeans.predict(X)
-        self._initial_means, self._initial_cov, self._initial_pi = self.calculate_mean_covariance(X, prediction)
-
-        return (self._initial_means, self._initial_cov, self._initial_pi)
-    def _e_step(self, X, pi, mu, sigma):
-        """Performs the E-step of the GMM
-
-        Parameters:
-        ------------
-        X: (N x d), data points
-        pi: (C), weights of mixture components
-        mu: (C x d), mixture component means
-        sigma: (C x d x d), mixture component covariance matrices
-
-        Returns:
-        ----------
-        gamma: (N x C), probabilities of clusters for objects
-        """
-        N = X.shape[0]
-        self.gamma = np.zeros((N, self.C))
-
-        const_c = np.zeros(self.C)
-
-        # fall back to the k-means based initial estimates only if the current
-        # parameters have not been set yet
-        self.mu = self.mu if self.mu is not None else self._initial_means
-        self.pi = self.pi if self.pi is not None else self._initial_pi
-        self.sigma = self.sigma if self.sigma is not None else self._initial_cov
-
-        for c in range(self.C):
-            # posterior distribution using Bayes rule
-            self.gamma[:, c] = self.pi[c] * mvn.pdf(X, self.mu[c, :], self.sigma[c])
-
-        # normalize across columns to make a valid probability
-        gamma_norm = np.sum(self.gamma, axis=1)[:, np.newaxis]
-        self.gamma /= gamma_norm
-
-        return self.gamma
-
-    def _m_step(self, X, gamma):
-        """Performs the M-step of the GMM.
-        Updates the priors, the means and the covariance matrices.
-
-        Parameters:
-        -----------
-        X: (N x d), data
-        gamma: (N x C), posterior distribution of lower bound
-
-        Returns:
-        ---------
-        pi: (C)
-        mu: (C x d)
-        sigma: (C x d x d)
-        """
-        N = X.shape[0]  # number of objects
-        C = self.gamma.shape[1]  # number of clusters
-        d = X.shape[1]  # dimension of each object
-
-        # updated weight of each gaussian
-        self.pi = np.mean(self.gamma, axis=0)
-
-        self.mu = np.dot(self.gamma.T, X) / np.sum(self.gamma, axis=0)[:, np.newaxis]
-
-        for c in range(C):
-            x = X - self.mu[c, :]  # (N x d)
-
-            gamma_diag = np.diag(self.gamma[:, c])
-            x_mu = np.matrix(x)
-            gamma_diag = np.matrix(gamma_diag)
-
-            sigma_c = x.T * gamma_diag * x
-            self.sigma[c, :, :] = (sigma_c) / np.sum(self.gamma, axis=0)[:, np.newaxis][c]
-
-        return self.pi, self.mu, self.sigma
-
-    def _compute_loss_function(self, X, pi, mu, sigma):
-        """Computes the lower bound loss function
-
-        Parameters:
-        -----------
-        X: (N x d), data
-
-        Returns:
-        ---------
-        loss: scalar value of the lower bound
-        """
-        N = X.shape[0]
-        C = self.gamma.shape[1]
-        self.loss = np.zeros((N, C))
-
-        for c in range(C):
-            dist = mvn(self.mu[c], self.sigma[c], allow_singular=True)
-            self.loss[:, c] = self.gamma[:, c] * (np.log(self.pi[c] + 0.00001) + dist.logpdf(X) - np.log(self.gamma[:, c] + 0.000001))
-        self.loss = np.sum(self.loss)
-        return self.loss
-    def fit(self, X):
-        """Runs the E-step and the M-step and
-        computes the lower bound
-
-        Parameters:
-        -----------
-        X: (N x d), data
-
-        Returns:
-        ----------
-        instance of GMM
-
-        """
-
-        d = X.shape[1]
-        self.mu, self.sigma, self.pi = self._initialise_parameters(X)
-
-        try:
-            for run in range(self.n_runs):
-                self.gamma = self._e_step(X, self.mu, self.pi, self.sigma)
-                self.pi, self.mu, self.sigma = self._m_step(X, self.gamma)
-                loss = self._compute_loss_function(X, self.pi, self.mu, self.sigma)
-
-                if run % 10 == 0:
-                    print("Iteration: %d Loss: %0.6f" % (run, loss))
-
-        except Exception as e:
-            print(e)
-
-        return self
-
-    def predict(self, X):
-        """Returns predicted labels, using Bayes rule to
-        calculate the posterior distribution
-
-        Parameters:
-        -------------
-        X: N*d numpy array
-
-        Returns:
-        ----------
-        labels: predicted cluster based on
-        the highest responsibility gamma
-
-        """
-        labels = np.zeros((X.shape[0], self.C))
-
-        for c in range(self.C):
-            labels[:, c] = self.pi[c] * mvn.pdf(X, self.mu[c, :], self.sigma[c])
-        labels = labels.argmax(1)
-        return labels
-
-    def predict_proba(self, X):
-        """Returns the per-component posterior weight
-        for every data point
-
-        Parameters:
-        -------------
-        X: N*d numpy array
-
-        Returns:
-        ----------
-        post_proba: (N x C) array of per-component values
-        pi_c * N(x | mu_c, sigma_c)
-
-        """
-        post_proba = np.zeros((X.shape[0], self.C))
-
-        for c in range(self.C):
-            # posterior distribution using Bayes rule, try to vectorise
-            post_proba[:, c] = self.pi[c] * mvn.pdf(X, self.mu[c, :], self.sigma[c])
-
-        return post_proba
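
A minimal sketch of how the GMM class above could be exercised on toy two-dimensional data; the blob locations, spreads and the choice of three components are arbitrary and only for illustration.

import numpy as np

rng = np.random.RandomState(0)
toy = np.vstack([
    rng.normal(loc=[0.0, 0.0], scale=0.5, size=(100, 2)),
    rng.normal(loc=[3.0, 3.0], scale=0.5, size=(100, 2)),
    rng.normal(loc=[0.0, 4.0], scale=0.5, size=(100, 2)),
])

model = GMM(C=3, n_runs=40)                   # three components, 40 EM iterations
model.fit(toy)                                # k-means initialisation followed by EM
hard_labels = model.predict(toy)              # most responsible component per point
weights = model.predict_proba(toy)            # per-component values pi_c * N(x | mu_c, sigma_c)
print(hard_labels[:5])
print(weights[0].round(4))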
-
-
-# Generates a TFIDF matrix with the given maximum number of features from the given dataset
-def generateTFIDFMatrix(dataset, maxfeatures):
-    tf_idf_vectorizor = TfidfVectorizer(stop_words='english', max_features=maxfeatures)
-    tf_idf = tf_idf_vectorizor.fit_transform(dataset)
-    tf_idf_norm = normalize(tf_idf)
-    output_array = tf_idf_norm.toarray()
-    return tf_idf, tf_idf_norm, output_array, tf_idf_vectorizor
-
-
-# Perform mixed Gaussian clustering on the input TFIDF array for the specified cluster number
-def performMixedGaussian(input_array, clusterNumber, withEllipses, withAnnotations, authors_names):
-
-    sklearn_pca_GMM = PCA(n_components=2, random_state=starting_seed)
-    Y_sklearn_GMM = sklearn_pca_GMM.fit_transform(input_array)
-
-    # fit the custom GMM implementation defined above (adapted from an external reference)
-    model = GMM(clusterNumber, n_runs=40)
-    fitted_values = model.fit(Y_sklearn_GMM)
-    predicted_values = model.predict(Y_sklearn_GMM)
-
-    # compute centers as the point of highest density of each distribution
-    centers_other = np.zeros((clusterNumber, 2))
-    for i in range(model.C):
-        density_other = mvn(cov=model.sigma[i], mean=model.mu[i]).logpdf(Y_sklearn_GMM)
-        centers_other[i, :] = Y_sklearn_GMM[np.argmax(density_other)]
-
-    # fit scikit-learn's GaussianMixture on the same projection
-    gmm = GaussianMixture(n_components=clusterNumber, covariance_type='full').fit(Y_sklearn_GMM)
-    prediction_gmm = gmm.predict(Y_sklearn_GMM)
-    probs = gmm.predict_proba(Y_sklearn_GMM)
-
-    centers = np.zeros((clusterNumber, 2))
-    for i in range(clusterNumber):
-        density = mvn(cov=gmm.covariances_[i], mean=gmm.means_[i]).logpdf(Y_sklearn_GMM)
-        centers[i, :] = Y_sklearn_GMM[np.argmax(density)]
-
-    return Y_sklearn_GMM, prediction_gmm
-
-
-# Clean the topic string so it can be vectorized
-def createTopicVector(topic):
-
-    # Lower case
-    topic = topic.lower()
-
-    # Tokenize
-    topic = RegexpTokenizer(r'\w+').tokenize(topic)
-
-    # Stem the tokens to simplify the processing
-    snowball = SnowballStemmer("english")
-    for i in range(0, len(topic)):
-        topic[i] = snowball.stem(topic[i])
-
-    # Remove all stop words to simplify processing
-    stop_en = stopwords.words('english')
-    topic = [t for t in topic if t not in stop_en]
-
-    # Remove any extremely short words that could bias the processing
-    topic = [t for t in topic if len(t) > 2]
-
-    topic = ' '.join(topic)
-
-    # Return the cleaned topic
-    return topic
-
-
-# Vectorize the topic together with the dataset using a TFIDF vectorizer with the specified max features
-def vectorizeTopic(dataset, topic, maxfeatures):
-    tf_idf_vectorizor = TfidfVectorizer(stop_words='english', max_features=maxfeatures)
-
-    # append the topic as one extra document at the end of the corpus
-    topic_series = pd.Series([topic])
-    dataset = pd.concat([dataset, topic_series])
-
-    tf_idf = tf_idf_vectorizor.fit_transform(dataset)
-    tf_idf_norm = normalize(tf_idf)
-    output_array = tf_idf_norm.toarray()
-    return tf_idf
-
-
-# Generate a similarity matrix from a TFIDF matrix
-def generateSimilarityMatrix(matrix):
-    similarity_matrix = matrix.dot(matrix.transpose())
-    return similarity_matrix
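
generateSimilarityMatrix relies on the fact that TfidfVectorizer L2-normalises its rows by default, so the plain dot product of the matrix with its transpose yields cosine similarities. A small check of that equivalence on made-up documents:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["graph embedding visualization",
        "deep learning on graphs",
        "natural language processing"]

tf_idf = TfidfVectorizer(stop_words='english').fit_transform(docs)

sim_from_dot = tf_idf.dot(tf_idf.transpose()).toarray()   # what generateSimilarityMatrix computes
sim_from_cosine = cosine_similarity(tf_idf)

print(np.allclose(sim_from_dot, sim_from_cosine))   # True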
-
-
-# Find the top n researchers most similar to the last column (the topic) of the similarity matrix
-def topNSimilarResearchers(matrix, number):
-
-    similar_column = matrix.getcol(matrix.shape[1] - 1).toarray()
-
-    top_researchers = []
-    for i in range(0, number):
-
-        current_choice = 0
-        while current_choice in top_researchers:
-            current_choice += 1
-        for j in range(0, len(similar_column)):
-            if similar_column[j] > similar_column[current_choice]:
-                if top_researchers.count(j) == 0 and j != matrix.shape[1] - 1:
-                    current_choice = j
-
-        top_researchers.append(current_choice)
-
-    return top_researchers
-
-
-# Creates a JSON list that specifies the coloring of the vertices based on the queried term
-# (e.g. shows the top 5 researchers that score highest for 'machine learning')
-def generateQueriedCoordinatesJSON(dataset, information, colors, ranking):
-
-    json_list = []
-
-    ranking_list = []
-
-    for i in range(0, len(dataset)):
-        currentRank = -1
-        if i in ranking:
-            currentRank = ranking.index(i)
-        next_connection = {"x0": dataset[i][0], "y0": dataset[i][1], "group": colors[i], "rank": currentRank}
-        next_connection.update(information[i])
-        json_list.append(next_connection)
-        ranking_list.append({"rank": currentRank})
-
-    return ranking_list
-
-
-# Generate a queried-string ranking for PeopleMap based on the query string, dataset,
-# number of top choices, and number of keywords
-def generateRankingJSON(query_string, number_of_top_picks, jsonName, df, numberOfKeywords):
-
-    if numberOfKeywords >= 1:
-        clean_data, research_labels, authors_names, authors_info = cleanData(df, True, numberOfKeywords)
-    else:
-        clean_data, research_labels, authors_names, authors_info = cleanData(df, False, 0)
-
-    # Generate the TFIDF matrix without the topic vector
-    tf_idf, tf_idf_norm, tf_idf_array, vectorizer = generateTFIDFMatrix(clean_data, 20000)
-
-    # Perform standard clustering and positioning of researchers
-    Y_sklearn_output, groups = performMixedGaussian(tf_idf_array, 5, False, True, authors_names)
-
-    # Create the topic vector of the query term
-    topic_clean = createTopicVector(query_string)
-
-    # Create a new matrix with the topic vector included
-    tf_idf_topic = vectorizeTopic(clean_data, topic_clean, 20000)
-
-    # Generate a similarity matrix
-    similarity_matrix = generateSimilarityMatrix(tf_idf_topic)
-
-    # Find the n top researchers for the queried string
-    top_researchers = topNSimilarResearchers(similarity_matrix, number_of_top_picks)
-
-    # Create the final ranking list
-    ranking_list = generateQueriedCoordinatesJSON(Y_sklearn_output, authors_info, groups, top_researchers)
-
-    return ranking_list
-
-
-df = pd.read_csv("ML_clean.csv")
-
-app = Flask(__name__)
-CORS(app)
-
-
-@app.route("/", methods=['POST'])
-def update_map():
-    # The query string, e.g. "Machine Learning"
-    inputtedString = request.json["inputString"]
-
-    # Number of keyword repetitions to add during cleaning
-    numberOfTopKeywords = request.json["numKeywords"]
-
-    # Number of top choices to return
-    numberOfTopChoices = request.json["numChoices"]
-
-    return jsonify(generateRankingJSON(inputtedString, numberOfTopChoices, "rankingData", df, numberOfTopKeywords))
-
-
-if __name__ == "__main__":
-    app.run(host='0.0.0.0', port=8000)
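
The nested loops in topNSimilarResearchers amount to picking the indices with the largest similarity to the query column while skipping the query itself. A more compact sketch of the same selection (the helper name top_n_similar is only illustrative, and ties may be broken in a different order):

import numpy as np

def top_n_similar(similarity_matrix, number):
    # similarities between every document and the query, which sits in the last row/column
    scores = similarity_matrix.getcol(similarity_matrix.shape[1] - 1).toarray().ravel()
    scores[-1] = -np.inf                      # never return the query itself
    return np.argsort(scores)[::-1][:number].tolist()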
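
Once the server is running, a client can query it with a POST request on port 8000, using the JSON keys read in update_map. An example call; the query string and parameter values are arbitrary, and localhost is assumed:

import requests

payload = {"inputString": "machine learning",
           "numKeywords": 2,
           "numChoices": 5}

response = requests.post("http://localhost:8000/", json=payload)
print(response.json())   # one {"rank": ...} entry per researcher, with -1 for those outside the top choices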