#
# Classifies all the files specified by an input argument.
#
# Uses a classifier trained with the text_topic_classifier.py script.
#
# Example run 0: python batch_classify.py -h
#
# Example run 1: python batch_classify.py -input "test-248-only-body-papers/*" -clf classifiers/Bagging.pkl
#
# Example run 2: python batch_classify.py -input "test-248-only-body-papers/*" -clf classifiers/Bagging.pkl -pos_dir pos_files -neg_dir neg_files
#
# This software is released under the GNU General Public License (GPL) v3.
#
# Written by Kamran Karimi
#

import os, argparse, glob, joblib, sys, shutil

# Command-line interface. Each entry pairs a flag with the keyword options
# handed to add_argument; the order here is the order shown by -h.
parser = argparse.ArgumentParser()
for _flag, _options in (
	("-input", {"help": "Files to be classified"}),
	("-clf", {"help": "The classifier to use, as a .pkl file"}),
	("-pos_file", {"default": 'positive_verdicts.txt', "help": "File to record positive verdict names"}),
	("-neg_file", {"default": 'negative_verdicts.txt', "help": "File to record negative verdict names"}),
	("-pos_dir", {"nargs": "?", "help": "Directory to contain positive verdict files"}),
	("-neg_dir", {"nargs": "?", "help": "Directory to contain negative verdict files"}),
):
	parser.add_argument(_flag, **_options)

# Parsed options; the rest of the script reads them through `args`.
args = parser.parse_args()

# Validate the two required options before doing any expensive work, so a
# missing -input is reported immediately instead of after the classifier
# has been loaded. The classifier is checked first, as before.
if not args.clf:
	print ("No classifier specified. Exiting.")
	sys.exit(-1)
if not args.input:
	print("No input files to classify specified. Exiting.")
	sys.exit(-1)

# Glob pattern selecting the files to classify.
path = args.input

# SECURITY NOTE: joblib.load unpickles the file, which can execute arbitrary
# code. Only load classifier files that come from a trusted source.
# flush=True makes the progress message visible while the load is running;
# without it the text (which has no trailing newline) may stay buffered.
print ("loading the classifier... ", end="", flush=True)
clf = joblib.load(args.clf)
print ("done")
	
# Classify every file matched by the input pattern. The name of each file is
# recorded in the positive or negative verdict list and, when a verdict
# directory was requested, the file is also copied into that directory.

# Create the optional verdict directories. exist_ok=True avoids failing when
# the directory already exists or appears between a check and its creation.
for out_dir in (args.neg_dir, args.pos_dir):
	if out_dir:
		os.makedirs(out_dir, exist_ok=True)

files = glob.glob(path)
num_read = 0
num_pos = 0
num_neg = 0

# The verdict lists are optional: an empty -pos_file / -neg_file leaves the
# corresponding handle as None and that list is simply not written.
pos = None
neg = None
try:
	if args.pos_file:
		pos = open(args.pos_file, 'w')
	if args.neg_file:
		neg = open(args.neg_file, 'w')

	print ("processed papers:")
	for file in files:
		# A pattern such as "dir/*" also matches sub-directories, which
		# cannot be read as text; skip anything that is not a regular file.
		if not os.path.isfile(file):
			continue

		name = os.path.basename(file)
		with open(file, 'r', encoding="utf8") as f:
			# The classifier is given a one-element list holding the whole text.
			input_data = [f.read()]
		num_read = num_read + 1

		# Progress counter, rewritten in place every 10 files.
		if num_read % 10 == 0:
			print(num_read, end='\r', flush=True)

		predicted = clf.predict(input_data)
		# Assumes predict() returns a sequence with one label per input
		# document (scikit-learn convention) -- confirm against the training
		# script. Taking element 0 explicitly avoids converting a whole array
		# to a scalar, which NumPy has deprecated.
		label = int(predicted[0])
		if label == 1:
			if pos is not None:
				pos.write("%s\n" % name)
			if args.pos_dir:
				shutil.copyfile(file, os.path.join(args.pos_dir, name))
			num_pos = num_pos + 1
		elif label == 0:
			if neg is not None:
				neg.write("%s\n" % name)
			if args.neg_dir:
				shutil.copyfile(file, os.path.join(args.neg_dir, name))
			num_neg = num_neg + 1
		else:
			print("Error! unknown class: ", predicted)

	# Final value of the progress counter.
	print(num_read, end='\r', flush=True)
finally:
	# Close the verdict lists even if reading or classifying a file failed.
	for handle in (pos, neg):
		if handle is not None:
			handle.close()

print()
print("Classified " + str(num_read) + " files. ", num_pos, "were categorized as positive and", num_neg, " were categorized as negative")







	
	