#There are many Python-based modules for extraction of text from a PDF file, e.g.
# --- PyPDF2, PDFPLUMBER, PDFMINER, PyMuPDF, TEXTRACT ---
#Each of these has some shortcomings. Some may not work on PDFs with subset
#fonts ... some may add a newline after every word or every few words ...
#------------------------------------------------------------------------------
#Ref Starting Point - https://gist.github.com/PBPatil
#------------------------------------------------------------------------------
#NLP Steps: Read text -> Split it into sentences -> Split into words -> Remove
# punctuation (e.g. dot) -> Remove symbols (such as copyright) -> Stemming
# or Lemmatization -> Compute word frequencies -> Summarize word statistics
#
#--------------------------CONCEPTS--------------------------------------------
#Tokenization: process of breaking the text into smaller units called tokens
#Stemming: process to extract the base forms of the words - e.g. reading -> read
#Lemmatization: another way to extract the base form of words
#
#--------------------------USER INPUT------------------------------------------
#Specify the input file - located in the same folder as this Python script. The
# PDF should be clean in terms of characters - copy the text and paste it into
# Notepad to check the legibility of the imported text.
fileName = '13_Delhi-Diary-Mohammad-Gandhi.pdf'
firstPg = 1                        #First page to extract
lastPg  = 400                      #Last page to extract
step    = 1                        #Step size from first to last page
#
#-------------------------IMPORT MODULES---------------------------------------
#Import Pandas and NumPy libraries for handling data tables
import pandas as pd
import numpy as np
import sys
import os
#Import PyPDF2 to extract text from PDF files
import PyPDF2
import textract
#Import REGEX library: REGular EXpression - shortcut to select characters
import re
import nltk
#Import module to divide the input text into sentences
from nltk.tokenize import sent_tokenize
#Import module to divide the input text into words
from nltk.tokenize import word_tokenize
#RAKE (Rapid Automatic Keyword Extraction)
#from rake_nltk import Rake
#Stop words are a pre-defined list of items for ease of use. The list can be
# modified - new words can be added or removed with append(), remove() or
# union() methods
from nltk.corpus import stopwords
stopWd = stopwords.words('english')
#Import stopwords with scikit-learn
#from sklearn.feature_extraction import text
#stopWd = text.ENGLISH_STOP_WORDS
#Import stopwords from GenSim library
#from gensim.parsing.preprocessing import STOPWORDS, remove_stopwords
#stopWd = STOPWORDS.union(set(['nuWords1', 'nuWords2']))
#
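#--------------------OPTIONAL: STEMMING / LEMMATIZATION SKETCH-----------------
#The NLP steps listed above mention stemming and lemmatization, but the main
#flow of this script does not apply them. The block below is only an
#illustrative sketch using NLTK's PorterStemmer and WordNetLemmatizer; it
#assumes the WordNet corpus has been downloaded. Uncomment it to experiment.
'''
from nltk.stem import PorterStemmer, WordNetLemmatizer
#nltk.download('wordnet')                 #If the WordNet corpus is missing
stemmer    = PorterStemmer()
lemmatizer = WordNetLemmatizer()
print(stemmer.stem('reading'))            #-> read
print(lemmatizer.lemmatize('leaves'))     #-> leaf
'''
#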
#----------------------------ADD ADDITIONAL STOP WORDS--------------------------
#For demonstration only - words not in standard English, found using the NLTK
#library. These words have been listed to work out a method to restore the
#original words. Most of the time, these stray words are prefixes and suffixes
#of standard words.
#
#These words are not defined in built-in libraries OR get generated during bad
#extraction of words. Sometimes, 'to' is extracted as 'ro', 'If' as 'lf'.
nuWrds01 = ('let','us','ir','tt','tf','even','ever','one','two','could','would')
nuWrds02 = ('should','dr','es','te','al','th','ee','sa','im','il','ig','un','ie')
nuWrds03 = ('av','id','rs','go','lo','fo','mo','wo','ab','ot','ob','oo','os','ts')
nuWrds04 = ('per','ru','lf','oy','oe','xi','ix','vi','vii','viii','xiv','xv','mr')
nuWrds05 = ('xvi','xvii','xviii','xix','xx','tp','unto','throughout','midst','ti')
nuWrds06 = ('re','co','ib','cg','st','ar','li','ea','se','ra','la','ic','ag','lt')
nuWrds07 = ('ing','ted','ist','ate','est','ere','less','ment','pro','ove','ance')
nuWrds08 = ('ice','nde','ity','tte','pre','con','res','end','tat','ah','ort','op')
nuWrds09 = ('ont','ning','ees','ven','han','ass','af','cc','ean','gee','ite','bu')
nuWrds10 = ('ame','ree','ful','ene','eve','com','nal','ber','sse','tar','gn','ok')
nuWrds11 = ('pea','ndi','hin','ther','dia','ting','nion','pak','ard','indi','ard')
nuWrds12 = ('ini','led','ang','tor','ave','rot','tic','ery','wit','ding','ell')
nuWrds13 = ('wan','goo','pur','sar','orr','ahe','arn','ona','tre','hap','cor','bs')
nuWrds14 = ('tis','het','sible','dom','alt','cen','che','dre','ets','tary','los')
stopWd.extend(nuWrds01); stopWd.extend(nuWrds02)
stopWd.extend(nuWrds03); stopWd.extend(nuWrds04)
stopWd.extend(nuWrds05); stopWd.extend(nuWrds06)
stopWd.extend(nuWrds07); stopWd.extend(nuWrds08)
stopWd.extend(nuWrds09); stopWd.extend(nuWrds10)
stopWd.extend(nuWrds11); stopWd.extend(nuWrds12)
stopWd.extend(nuWrds13); stopWd.extend(nuWrds14)
#------------------------------------------------------------------------------
#open() allows you to read the file; numPages gives the page count (PyPDF2 3.x
# uses PdfReader and len(reader.pages) instead)
pdff = open(fileName, 'rb')
nPg = PyPDF2.PdfFileReader(pdff).numPages
i = firstPg
lastPg = min(lastPg, nPg)
text = ""
#------------------------------------------------------------------------------
'''
import pdfplumber
pdff = pdfplumber.open(fileName)
#Iterate over all the pages - pdfplumber
while i < lastPg:
    pgTxt = pdff.pages[i].extract_text()
    i = i + 1
    text = text + pgTxt
'''
#------------------------------------------------------------------------------
#PyMuPDF
import fitz
pdff = fitz.open(fileName)
for pg in pdff.pages(firstPg, lastPg, step):
    text = text + pg.getText()        #Extract page (pg.get_text() in newer PyMuPDF)
#'text' = variable which contains all the text derived from our PDF file
#
text = text.replace('\n', ' ')                 #Replace newline characters
text = text.replace('\r', ' ')
text = text.replace("- ", '')                  #Remove hyphens left at line breaks
text = re.sub(r"[\w]-[\w]", '', text)
text = re.sub(r" [\w][\w] ", ' ', text)        #Remove words of 1/2 characters
text = re.sub(r" [\w] ", ' ', text)
#Repair word fragments formed due to hyphenation, e.g. 'read ing' -> 'reading'
text = re.sub(r"\b[\W]ing", "ing", text)
text = re.sub(r"\b[\W]tion", "tion", text)
text = re.sub(r"\b[\W]ment", "ment", text)
#------------------------------------------------------------------------------
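#--------------------OPTIONAL: SENTENCE SPLIT SKETCH---------------------------
#The NLP steps above also mention splitting the text into sentences; the main
#flow below works directly with words, so this is only an illustrative sketch
#using the sent_tokenize() imported earlier (it may require the NLTK 'punkt'
#tokenizer data). Uncomment it to inspect a few sentences of the cleaned text.
'''
#nltk.download('punkt')                   #If the punkt tokenizer data is missing
sentences = sent_tokenize(text)
print(len(sentences), "sentences found")
print(sentences[:3])                      #Show the first few sentences
'''
#------------------------------------------------------------------------------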
'''
#The while loop will read each page - PyPDF2
#The pdfReader variable is a readable object that will be parsed. This will not
#work well for some PDF files, e.g. files containing subsetted fonts
pdfReader = PyPDF2.PdfFileReader(pdff)
#Discerning the number of pages will allow us to parse through all the pages
nPages = pdfReader.numPages
while i < lastPg:
    pageObj = pdfReader.getPage(i)
    pgTxt = pageObj.extractText()
    text = text + pgTxt
    i = i + 1
'''
#
text = text.replace('\x0c', '')                #Remove form feed (page break) characters
text = text.replace('\xa9', '')                #Remove copyright symbols
text = re.sub(r'\b([0-9])+\w', '', text)       #Remove words starting with a number
#Change each word to lower case
text = text.lower()
#------------------------------WRITE EXTRACTED TEXT----------------------------
f = open("textExtracted.txt", "w"); f.write(text); f.close()
#
#REGEX examples: search string and replace unwanted characters
#Special characters: \ followed by a, A, b, B, d, D, s, S, w, W or Z
cleanedTxt = re.findall(r'\b[a-zA-Z]\w+', text)
trimmedTxt = ' '.join([str(wd) for wd in cleanedTxt])
#f = open("cleanedTxt.txt", "w"); f.write(trimmedTxt); f.close()
#Remove stray words - for demonstration
trimmedTxt = re.sub(r"\b ing ", '', trimmedTxt)
trimmedTxt = re.sub(r"\b tion", '', trimmedTxt)
trimmedTxt = re.sub(r"\b ment", '', trimmedTxt)
#Remove stray words - using the NLTK list of English words; keep a token if it
# is a known English word or is not purely alphabetic
#nltk.download('words')                        #If 'Resource words not found' error
words = set(nltk.corpus.words.words())
trimmedTxt = " ".join(w for w in nltk.wordpunct_tokenize(trimmedTxt)
                      if w.lower() in words or not w.isalpha())
#Remove stop words from input text: tokenize -> drop stop words -> join words
txtTokens = word_tokenize(trimmedTxt)
txtNoStopWd = [word for word in txtTokens if word not in stopWd]
#Create the string out of words trimmed by removing stop words
finalText = ' '.join(txtNoStopWd)
#f = open("finalTxt.txt", "w"); f.write(finalText); f.close()
'''
with open("Keywords.txt", "w") as outfile:
    outfile.write("\n".join(str(item) for item in itemlist))
'''
#------------------------------------------------------------------------------
#Dataframe with unique keywords to avoid repetition in rows
keywords = re.findall(r'[a-zA-Z]\w+', finalText)
f = open("Keywords.txt", "w")
for Wrd in keywords:
    f.write("%s\n" % Wrd)
f.close()
df = pd.DataFrame(list(set(keywords)), columns=['keywords'])
#np.savetxt(r'dataFrame.txt', df.values, fmt='%s', delimiter=',')
def weightage(word, y):
    wordList = re.findall(word, y)
    wordFreq = len(wordList)
    #Calculate Term Frequency (occurrences per character of the text)
    tf = wordFreq/float(len(y))
    #Calculate Inverse Document Frequency (single-document variant)
    idf = np.log(1/float(wordFreq))
    tf_idf = tf*idf
    #Round off to 4 digits after the decimal place
    tf = "{0:8.4f}".format(tf)
    idf = "{0:8.4f}".format(idf)
    tf_idf = "{0:8.4f}".format(tf_idf)
    return wordFreq, tf, idf, tf_idf
#------------------------------------------------------------------------------
#Print summary of words
df['wordFreq'] = df['keywords'].apply(lambda x: weightage(x, finalText)[0])
df['tf']     = df['keywords'].apply(lambda x: weightage(x, finalText)[1])
df['idf']    = df['keywords'].apply(lambda x: weightage(x, finalText)[2])
df['tf_idf'] = df['keywords'].apply(lambda x: weightage(x, finalText)[3])
df = df.sort_values('wordFreq', ascending=False)
df.to_csv('Keywords.csv')
print(df.head(25))
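#--------------------WORKED EXAMPLE OF THE tf_idf ARITHMETIC-------------------
#Hypothetical numbers only, to illustrate what weightage() computes: a word
#appearing 50 times in a finalText of 200,000 characters gives
#   tf     = 50/200000          = 0.00025
#   idf    = ln(1/50)           = about -3.912
#   tf_idf = 0.00025 * (-3.912) = about -0.00098
#With this single-document definition, idf is negative for any word that occurs
#more than once, so the ranking written to Keywords.csv is driven by wordFreq.
'''
freq = 50; nChar = 200000                 #Hypothetical counts, for illustration
print(freq/float(nChar), np.log(1/float(freq)))
'''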