#
# This code trains a number of sklearn classifiers to sort text into relevant and irrelevant groups.
# Requires Python 3 and Scikit-learn
#
# The example training and test data provided in the package are for distinguishing between papers about echinoderms, and papers not
# relevant to echinoderms. The title and abstract of relevant and irrelevant papers are sourced from NCBI's PubMed and saved
# as individual files.
# 
# Echinoderm positive training files: training-relevant-papers/
# Echinoderm negative training files: training-irrelevant-papers/
#
# Echinoderm positive test files: test-relevant-papers/
# Echinoderm negative test files: test-irrelevant-papers/
#
# The following are the papers with no mention of echinoderms in the title or abstract, and presumed to
# be mostly negative files from PubMed: test-248-only-body-papers/ 
#
#
# Example 0: Display command line options: 
# python3 text_topic_classifier.py -h
#
# Example 1: Generate classifiers from echinoderm training data without saving the classifiers:
# python3 text_topic_classifier.py -train_pos "training-relevant-papers/*" -train_neg "training-irrelevant-papers/*"
#
# Example 2: Generate classifiers from echinoderm training data and use them on echinoderm test data. Classifiers aren't saved:
# python3 text_topic_classifier.py -train_pos "training-relevant-papers/*" -train_neg "training-irrelevant-papers/*" -test_pos "test-relevant-papers/*" -test_neg "test-irrelevant-papers/*"
#
# Example 3: Generate classifiers from echinoderm training data and use them on mostly negative echinoderm test data. Classifiers aren't saved:
# python3 text_topic_classifier.py -train_pos "training-relevant-papers/*" -train_neg "training-irrelevant-papers/*" -test_neg "test-248-only-body-papers/*"
#
# Example 4: Generate classifiers from echinoderm training data and save the generated classifiers to the specified folder. The classifiers can be used later. For an example see LiteratureLoader.java
# Example run 4: python3 text_topic_classifier.py -train_pos "training-relevant-papers/*" -train_neg "training-irrelevant-papers/*" -cv 10 -save_clf -clf_directory created_classifers
#
#
# This software is released under the GNU General Public License (GPL) v3.
#
# Written by Kamran Karimi
#

import sys, os, argparse, numpy, glob, joblib

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import cross_validate
# List of classifiers to use
from sklearn.linear_model import RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier, BaggingClassifier

# Print floats inside numpy arrays with one decimal place.
numpy.set_printoptions(formatter={'float': "{0:3.1f}".format})

# Training set: document texts, their class labels, the source file names,
# and the number of documents read per class.
train_data, train_classes, train_names = [], [], []
positive_trains = negative_trains = 0

# Test set, laid out the same way as the training set.
test_data, test_classes, test_names = [], [], []
positive_tests = negative_tests = 0

# Default number of cross-validation folds (overridable with -cv).
myCV = 20

# Whether, and where, the fitted classifiers are saved (-save_clf, -clf_directory).
save_clfs = False
clf_directory = "classifiers"

# Extra console output (-verbose).
verbose = False

# Report test documents whose prediction differs from the provided class (-test_mismatch).
test_mismatch = False

def read_data_class(data, classes, path, class_value, file_names):
	"""Read every file matching a glob pattern and append it to the given lists.

	Args:
		data: list that receives the full text of each file read.
		classes: list that receives class_value once per file read.
		path: glob pattern selecting the files (e.g. "training-relevant-papers/*").
		class_value: label recorded for every file matched by path.
		file_names: list that receives the base name of each file read, or
			None to skip recording names.

	Returns:
		The number of files read.

	Raises:
		OSError: if a matched path cannot be opened or read.
		UnicodeDecodeError: if a file is not valid UTF-8.
	"""
	num_read = 0

	# glob returns matches in arbitrary, platform-dependent order; sorting
	# keeps the document order (and anything that depends on it) reproducible.
	for file in sorted(glob.glob(path)):
		# The context manager closes the file even if reading or decoding fails.
		with open(file, 'r', encoding="utf8") as f:
			data.append(f.read())
		classes.append(class_value)
		if file_names is not None:
			file_names.append(os.path.basename(file))
		num_read += 1

	return num_read

# Set up the command-line arguments. The defaults (and the defaults quoted in
# the help text) come from the module-level settings defined above.
parser = argparse.ArgumentParser()

# The training sets are described as required in the help text, but argparse
# does not enforce that here.
parser.add_argument("-train_pos", help = 'Positive training text files (e.g. "training-relevant-papers/*"). Required.')
# Help text fix: the closing quote and parenthesis were transposed.
parser.add_argument("-train_neg", help = 'Negative training text files (e.g. "training-irrelevant-papers/*"). Required.')
parser.add_argument("-test_pos", help = "Positive test text files. Optional.")
parser.add_argument("-test_neg", help = "Negative test text files. Optional.")
# type=int makes argparse reject a non-numeric fold count with a usage error
# instead of letting it fail later during integer conversion.
parser.add_argument("-cv", type = int, default = myCV, help = "Number of cross validations on training data. Default is " + str(myCV))
parser.add_argument("-save_clf", action="store_true", default = save_clfs, help="Save the classifier(s) as .pkl file(s). Default is " + str(save_clfs))
parser.add_argument("-clf_directory", default = clf_directory, help = "Directory to save the classifier(s) in .pkl format. Default is " + clf_directory)
parser.add_argument("-verbose", action="store_true", default = verbose, help="Prints out more information to the console. Default is " + str(verbose))
parser.add_argument("-test_mismatch", action="store_true", default = test_mismatch, help="Prints out test mistakes (cases with a different prediction than the provided class). Default is " + str(test_mismatch))

args = parser.parse_args()

# Apply the parsed options on top of the module-level defaults.
myCV = int(args.cv) if args.cv else myCV

# Load the documents: label 1 for the positive sets, label 0 for the negative
# sets. Within each data set the positives are read before the negatives.
if args.train_pos:
	positive_trains += read_data_class(train_data, train_classes, args.train_pos, 1, train_names)
if args.train_neg:
	negative_trains += read_data_class(train_data, train_classes, args.train_neg, 0, train_names)
if args.test_pos:
	positive_tests += read_data_class(test_data, test_classes, args.test_pos, 1, test_names)
if args.test_neg:
	negative_tests += read_data_class(test_data, test_classes, args.test_neg, 0, test_names)

# A flag given on the command line switches the setting on; otherwise the
# existing value is kept.
save_clfs = True if args.save_clf else save_clfs
verbose = True if args.verbose else verbose
test_mismatch = True if args.test_mismatch else test_mismatch
clf_directory = args.clf_directory if args.clf_directory else clf_directory

# Startup banner.
print()
print("Starting text_topic_classifier. For detailed information please refer to the paper 'Classifying domain-specific text documents containing ambiguous keywords', Database (Oxford), https://doi.org/10.1093/database/baab062")
print()
print("Please note that some classifiers may need a longer time to run. You can edit this Python file to add or remove classifiers.")
print()

# Training needs at least one positive and one negative document; stop otherwise.
if not (positive_trains and negative_trains):
	print ("Error: Missing positive and negative training data. See this Python file for example runs. For help use the -h option. Exiting.")
	sys.exit(-1)

# Summary of the training documents that were read.
print("\nread ", len(train_data), " training documents, ", positive_trains, " positives and ", negative_trains, " negatives", sep="")
if verbose:
	print("Training data classes: " , train_classes)
print()

# Summary of the test documents, printed only when some were read.
if positive_tests or negative_tests:
	print("read ", len(test_data), " test documents, ", positive_tests, " positives and ", negative_tests, " negatives", sep="")
	if verbose:
		print("Test data classes: " , test_classes)
	print()

# Set up the classifiers. You can add and remove (comment out) any classifiers here.
# Be sure to make corresponding changes in the clf_pipelines variable.
def _tfidf_pipeline(clf):
	"""Return a pipeline that TF-IDF vectorizes text (English stop words) and feeds it to clf."""
	return Pipeline([('tfidf', TfidfVectorizer(stop_words='english')), ('clf', clf)])


# Linear models
rc = _tfidf_pipeline(RidgeClassifier(random_state=0))
sgd = _tfidf_pipeline(SGDClassifier(random_state=0))
pa = _tfidf_pipeline(PassiveAggressiveClassifier(max_iter=2000, random_state=0))
lr = _tfidf_pipeline(LogisticRegression(solver='lbfgs', random_state=0))

# Naive Bayes variants
mnb = _tfidf_pipeline(MultinomialNB())
cnb = _tfidf_pipeline(ComplementNB())
bnb = _tfidf_pipeline(BernoulliNB())

# Tree-based models and ensembles
tree = _tfidf_pipeline(DecisionTreeClassifier(random_state=0))
forest = _tfidf_pipeline(RandomForestClassifier(n_estimators=200, random_state=0, n_jobs = -1))
bag = _tfidf_pipeline(BaggingClassifier(estimator=DecisionTreeClassifier(criterion='entropy'), n_estimators=200, random_state=0, n_jobs=-1))
ab = _tfidf_pipeline(AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=200, random_state=0))

# Other models
knn = _tfidf_pipeline(KNeighborsClassifier())
svc = _tfidf_pipeline(SVC(kernel='linear', gamma='auto', probability=True, random_state=0))
mlp = _tfidf_pipeline(MLPClassifier(solver='lbfgs', random_state=0))

# Voting ensemble over the pipelines above (knn is not one of the voters).
voters = VotingClassifier(
	estimators=[
		('mnb', mnb),
		('cnb', cnb),
		('bnb', bnb),
		('pa', pa),
		('rc', rc),
		('lr', lr),
		('sgd', sgd),
		('tree', tree),
		('forest', forest),
		('svc', svc),
		('mlp', mlp),
		('ab', ab),
		('bag', bag),
	],
	n_jobs = -1,
)


# Classifiers in this list will be run. They must have been created above.
# The display names are also used as the .pkl file names when the classifiers
# are saved, so their spelling is kept exactly as it is.
clf_pipelines = [
	("Ridge Classifier", rc),
	("Stochastic Gradient Descent", sgd),
	("Passive Agressive", pa),
	("Logistic Regression", lr),
	("Multinomial NB", mnb),
	("Complemenat NB", cnb),
	("Bernoulli NB", bnb),
	("Decision Tree", tree),
	("Random Forest", forest),
	("Bagging", bag),
	("K Nearest Neighbor", knn),
	("Ada Boost", ab),
	("SVC", svc),
	("MultiLayer Percepteron", mlp),
	("Voters", voters),
]

# A shorter list of classifiers to try
#clf_pipelines = [('Bagging', bag), ("RidgeClassifier", rc)]


# Main loop: for every configured classifier, cross-validate on the training
# data, fit on the full training set, optionally save the fitted classifier,
# and optionally evaluate it on the test data.

# Metrics collected during cross-validation; results appear under 'test_<name>'.
cv_scoring = {'accuracy': 'accuracy', 'precision': 'precision', 'recall':'recall'}

if save_clfs:
	# exist_ok avoids a check-then-create race, and creating the directory
	# before any training reports an unusable location right away.
	os.makedirs(clf_directory, exist_ok=True)

for clf_name, classifier in clf_pipelines:
	print(clf_name + " ", end='', flush=True)
	scores = cross_validate(classifier, train_data, train_classes, cv=myCV, scoring=cv_scoring, n_jobs=-1)
	print(str(myCV) + "-fold training accuracy: %3.1f%% accuracy with a standard deviation of %0.3f" % (scores['test_accuracy'].mean() * 100, scores['test_accuracy'].std() * 100))
	if verbose:
		print(clf_name + " " + str(myCV) + "-fold training precision: %3.1f%%, with a standard deviation of %0.3f" % (scores['test_precision'].mean() * 100, scores['test_precision'].std() * 100))
		print(clf_name + " " + str(myCV) +"-fold training recall: %3.1f%%, with a standard deviation of %0.3f" % (scores['test_recall'].mean() * 100, scores['test_recall'].std() * 100))

	# fit the training data
	classifier.fit(train_data, train_classes)

	# save the classifier if requested; the file is named after the display name
	if save_clfs:
		joblib.dump(classifier, os.path.join(clf_directory, clf_name + ".pkl"))

	# run tests if test data provided
	if positive_tests + negative_tests > 0:
		test_predicted = classifier.predict(test_data)
		(precision, recall, fscore, support) = precision_recall_fscore_support(test_classes, test_predicted, zero_division=0)
		# normalize=False requests the number of correct predictions, not a fraction
		accuracy = accuracy_score(test_classes, test_predicted, normalize=False)
		# Convert the label list to an array so the comparison is explicitly element-wise.
		fraction_correct = numpy.mean(numpy.asarray(test_classes) == test_predicted)
		print(clf_name + " test accuracy: %3.1f%% (%s/%s)" %(fraction_correct * 100, accuracy, str(len(test_classes)) ))
		print(clf_name + " test precision: ", str(precision * 100) + "%, recall: " + str(recall * 100) + "%, fscore: " + str(fscore * 100) + "%, support: ", str(support))
		print()

		# list every test document whose prediction differs from its provided class
		if test_mismatch:
			for name, predicted, expected in zip(test_names, test_predicted, test_classes):
				if predicted != expected:
					print (name, "is predicted as " + str(predicted) + ", provided class was " + str(expected))

	print("-------")
