#There are many Python-based modules for extraction of text from a PDF file, e.g.
# --- PyPDF2, PDFPLUMBER, PDFMINER, PyMuPDF, TEXTRACT ---
#Each of these has some shortcomings. Some may not work on PDFs with subset
#fonts ... some may add a newline after every word or every few words ...
#------------------------------------------------------------------------------
#Ref Starting Point - https://gist.github.com/PBPatil
#------------------------------------------------------------------------------
#NLP Steps: Read text -> Split it into sentences -> Split into words -> Remove
# punctuation (e.g. dot) -> Remove symbols (such as copyright) -> Stemming
# or Lemmatization -> Compute word frequencies -> Summarize word statistics
#
#--------------------------CONCEPTS--------------------------------------------
#Tokenization: process of breaking the text into smaller units called tokens
#Stemming: process to extract the base forms of the words - e.g. reading -> read
#Lemmatization: another way to extract the base form of words
#
#--------------------------USER INPUT------------------------------------------
#Specify the input file - located in the same folder as this Python script. The
# PDF should be clean in terms of characters - copy the text and paste it into
# Notepad to check the legibility of the imported text.
fileName = '13_Delhi-Diary-Mohammad-Gandhi.pdf'
firstPg = 1                        #First page to extract
lastPg  = 400                      #Last page to extract
step    = 1                        #Step size from first to last page
#
#-------------------------IMPORT MODULES---------------------------------------
#Import Pandas and NumPy libraries for handling data tables
import pandas as pd
import numpy as np
import sys
import os
#Import PyPDF2 to extract text from PDF files
import PyPDF2
import textract
#Import REGEX library: REGular EXpression - shortcut to select characters
import re
import nltk
#Import module to divide the input text into sentences
from nltk.tokenize import sent_tokenize
#Import module to divide the input text into words
from nltk.tokenize import word_tokenize
#RAKE (Rapid Automatic Keyword Extraction)
#from rake_nltk import Rake
#Stop words are a pre-defined list of items for ease of use. The list can be
# modified - new words can be added or removed with append(), remove() or
# union() methods
from nltk.corpus import stopwords
stopWd = stopwords.words('english')
#Import stopwords with scikit-learn
#from sklearn.feature_extraction import text
#stopWd = text.ENGLISH_STOP_WORDS
#Import stopwords from GenSim library
#from gensim.parsing.preprocessing import STOPWORDS, remove_stopwords
#stopWd = STOPWORDS.union(set(['nuWords1', 'nuWords2']))
#
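#--------------------OPTIONAL: STEMMING / LEMMATIZATION SKETCH-----------------
#The NLP steps listed above mention stemming and lemmatization, but the main
#flow of this script does not apply them. The block below is only an
#illustrative sketch using NLTK's PorterStemmer and WordNetLemmatizer; it
#assumes the WordNet corpus has been downloaded. Uncomment it to experiment.
'''
from nltk.stem import PorterStemmer, WordNetLemmatizer
#nltk.download('wordnet')                 #If the WordNet corpus is missing
stemmer    = PorterStemmer()
lemmatizer = WordNetLemmatizer()
print(stemmer.stem('reading'))            #-> read
print(lemmatizer.lemmatize('leaves'))     #-> leaf
'''
#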
#----------------------------ADD ADDITIONAL STOP WORDS--------------------------
#For demonstration only - words not in standard English, found using the NLTK
#library. These words have been listed to work out a method to restore the
#original words. Most of the time, these stray words are prefixes and suffixes
#of standard words.
#
#These words are not defined in built-in libraries OR get generated during bad
#extraction of words. Sometimes, 'to' is extracted as 'ro', 'If' as 'lf'.
nuWrds01 = ('let','us','ir','tt','tf','even','ever','one','two','could','would')
nuWrds02 = ('should','dr','es','te','al','th','ee','sa','im','il','ig','un','ie')
nuWrds03 = ('av','id','rs','go','lo','fo','mo','wo','ab','ot','ob','oo','os','ts')
nuWrds04 = ('per','ru','lf','oy','oe','xi','ix','vi','vii','viii','xiv','xv','mr')
nuWrds05 = ('xvi','xvii','xviii','xix','xx','tp','unto','throughout','midst','ti')
nuWrds06 = ('re','co','ib','cg','st','ar','li','ea','se','ra','la','ic','ag','lt')
nuWrds07 = ('ing','ted','ist','ate','est','ere','less','ment','pro','ove','ance')
nuWrds08 = ('ice','nde','ity','tte','pre','con','res','end','tat','ah','ort','op')
nuWrds09 = ('ont','ning','ees','ven','han','ass','af','cc','ean','gee','ite','bu')
nuWrds10 = ('ame','ree','ful','ene','eve','com','nal','ber','sse','tar','gn','ok')
nuWrds11 = ('pea','ndi','hin','ther','dia','ting','nion','pak','ard','indi','ard')
nuWrds12 = ('ini','led','ang','tor','ave','rot','tic','ery','wit','ding','ell')
nuWrds13 = ('wan','goo','pur','sar','orr','ahe','arn','ona','tre','hap','cor','bs')
nuWrds14 = ('tis','het','sible','dom','alt','cen','che','dre','ets','tary','los')
stopWd.extend(nuWrds01); stopWd.extend(nuWrds02)
stopWd.extend(nuWrds03); stopWd.extend(nuWrds04)
stopWd.extend(nuWrds05); stopWd.extend(nuWrds06)
stopWd.extend(nuWrds07); stopWd.extend(nuWrds08)
stopWd.extend(nuWrds09); stopWd.extend(nuWrds10)
stopWd.extend(nuWrds11); stopWd.extend(nuWrds12)
stopWd.extend(nuWrds13); stopWd.extend(nuWrds14)
#------------------------------------------------------------------------------
#open() allows you to read the file; numPages gives the page count (PyPDF2 3.x
# uses PdfReader and len(reader.pages) instead)
pdff = open(fileName, 'rb')
nPg = PyPDF2.PdfFileReader(pdff).numPages
i = firstPg
lastPg = min(lastPg, nPg)
text = ""
#------------------------------------------------------------------------------
'''
import pdfplumber
pdff = pdfplumber.open(fileName)
#Iterate over all the pages - pdfplumber
while i < lastPg:
    pgTxt = pdff.pages[i].extract_text()
    i = i + 1
    text = text + pgTxt
'''
#------------------------------------------------------------------------------
#PyMuPDF
import fitz
pdff = fitz.open(fileName)
for pg in pdff.pages(firstPg, lastPg, step):
    text = text + pg.getText()        #Extract page (pg.get_text() in newer PyMuPDF)
#'text' = variable which contains all the text derived from our PDF file
#
text = text.replace('\n', ' ')                 #Replace newline characters
text = text.replace('\r', ' ')
text = text.replace("- ", '')                  #Remove hyphens left at line breaks
text = re.sub(r"[\w]-[\w]", '', text)
text = re.sub(r" [\w][\w] ", ' ', text)        #Remove words of 1/2 characters
text = re.sub(r" [\w] ", ' ', text)
#Repair word fragments formed due to hyphenation, e.g. 'read ing' -> 'reading'
text = re.sub(r"\b[\W]ing", "ing", text)
text = re.sub(r"\b[\W]tion", "tion", text)
text = re.sub(r"\b[\W]ment", "ment", text)
#------------------------------------------------------------------------------
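#--------------------OPTIONAL: SENTENCE SPLIT SKETCH---------------------------
#The NLP steps above also mention splitting the text into sentences; the main
#flow below works directly with words, so this is only an illustrative sketch
#using the sent_tokenize() imported earlier (it may require the NLTK 'punkt'
#tokenizer data). Uncomment it to inspect a few sentences of the cleaned text.
'''
#nltk.download('punkt')                   #If the punkt tokenizer data is missing
sentences = sent_tokenize(text)
print(len(sentences), "sentences found")
print(sentences[:3])                      #Show the first few sentences
'''
#------------------------------------------------------------------------------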
'''
#The while loop will read each page - PyPDF2
#The pdfReader variable is a readable object that will be parsed. This will not
#work well for some PDF files, e.g. files containing subsetted fonts
pdfReader = PyPDF2.PdfFileReader(pdff)
#Discerning the number of pages will allow us to parse through all the pages
nPages = pdfReader.numPages
while i < lastPg:
    pageObj = pdfReader.getPage(i)
    pgTxt = pageObj.extractText()
    text = text + pgTxt
    i = i + 1
'''
#
text = text.replace('\x0c', '')                #Remove form feed (page break) characters
text = text.replace('\xa9', '')                #Remove copyright symbols
text = re.sub(r'\b([0-9])+\w', '', text)       #Remove words starting with a number
#Change each word to lower case
text = text.lower()
#------------------------------WRITE EXTRACTED TEXT----------------------------
f = open("textExtracted.txt", "w"); f.write(text); f.close()
#
#REGEX examples: search string and replace unwanted characters
#Special characters: \ followed by a, A, b, B, d, D, s, S, w, W or Z
cleanedTxt = re.findall(r'\b[a-zA-Z]\w+', text)
trimmedTxt = ' '.join([str(wd) for wd in cleanedTxt])
#f = open("cleanedTxt.txt", "w"); f.write(trimmedTxt); f.close()
#Remove stray words - for demonstration
trimmedTxt = re.sub(r"\b ing ", '', trimmedTxt)
trimmedTxt = re.sub(r"\b tion", '', trimmedTxt)
trimmedTxt = re.sub(r"\b ment", '', trimmedTxt)
#Remove stray words - using the NLTK list of English words; keep a token if it
# is a known English word or is not purely alphabetic
#nltk.download('words')                        #If 'Resource words not found' error
words = set(nltk.corpus.words.words())
trimmedTxt = " ".join(w for w in nltk.wordpunct_tokenize(trimmedTxt)
                      if w.lower() in words or not w.isalpha())
#Remove stop words from input text: tokenize -> drop stop words -> join words
txtTokens = word_tokenize(trimmedTxt)
txtNoStopWd = [word for word in txtTokens if word not in stopWd]
#Create the string out of words trimmed by removing stop words
finalText = ' '.join(txtNoStopWd)
#f = open("finalTxt.txt", "w"); f.write(finalText); f.close()
'''
with open("Keywords.txt", "w") as outfile:
    outfile.write("\n".join(str(item) for item in itemlist))
'''
#------------------------------------------------------------------------------
#Dataframe with unique keywords to avoid repetition in rows
keywords = re.findall(r'[a-zA-Z]\w+', finalText)
f = open("Keywords.txt", "w")
for Wrd in keywords:
    f.write("%s\n" % Wrd)
f.close()
df = pd.DataFrame(list(set(keywords)), columns=['keywords'])
#np.savetxt(r'dataFrame.txt', df.values, fmt='%s', delimiter=',')
def weightage(word, y):
    wordList = re.findall(word, y)
    wordFreq = len(wordList)
    #Calculate Term Frequency (occurrences per character of the text)
    tf = wordFreq/float(len(y))
    #Calculate Inverse Document Frequency (single-document variant)
    idf = np.log(1/float(wordFreq))
    tf_idf = tf*idf
    #Round off to 4 digits after the decimal place
    tf = "{0:8.4f}".format(tf)
    idf = "{0:8.4f}".format(idf)
    tf_idf = "{0:8.4f}".format(tf_idf)
    return wordFreq, tf, idf, tf_idf
#------------------------------------------------------------------------------
#Print summary of words
df['wordFreq'] = df['keywords'].apply(lambda x: weightage(x, finalText)[0])
df['tf']     = df['keywords'].apply(lambda x: weightage(x, finalText)[1])
df['idf']    = df['keywords'].apply(lambda x: weightage(x, finalText)[2])
df['tf_idf'] = df['keywords'].apply(lambda x: weightage(x, finalText)[3])
df = df.sort_values('wordFreq', ascending=False)
df.to_csv('Keywords.csv')
print(df.head(25))
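#--------------------WORKED EXAMPLE OF THE tf_idf ARITHMETIC-------------------
#Hypothetical numbers only, to illustrate what weightage() computes: a word
#appearing 50 times in a finalText of 200,000 characters gives
#   tf     = 50/200000          = 0.00025
#   idf    = ln(1/50)           = about -3.912
#   tf_idf = 0.00025 * (-3.912) = about -0.00098
#With this single-document definition, idf is negative for any word that occurs
#more than once, so the ranking written to Keywords.csv is driven by wordFreq.
'''
freq = 50; nChar = 200000                 #Hypothetical counts, for illustration
print(freq/float(nChar), np.log(1/float(freq)))
'''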