#------------------------------------------------------------------------------
#Turns connected groups of dark pixels larger than a size limit to white.
#It uses the Connected Component Labeling (CCL) method to find noisy pixels.
#------------------------------------------------------------------------------
import sys, os
import numpy as np
import cv2

#------------------------------------------------------------------------------
if len(sys.argv) < 3:
    print("\nUsage: python3 {} image_file threshold \n".format(sys.argv[0]))
    sys.exit(1)
#------------------------------------------------------------------------------

#Get the input file name and the integer gray-level threshold
inFile = str(sys.argv[1])
threshold = int(sys.argv[2])

#Define names of the output files. abspath() resolves a relative name against
#the current directory and leaves an already-absolute name untouched (plain
#concatenation with getcwd() produced a wrong path for absolute inputs).
file_path = os.path.abspath(inFile)

#Split off the extension with splitext(). str.strip(extn) must not be used
#here: it treats extn as a *set of characters* to remove from both ends, so
#it can eat the tail of the base name too (e.g. "ping.png" -> "pi").
base_path, extn = os.path.splitext(file_path)

outFile0 = base_path + "-GR.png"   #grayscale copy of the input
outFile1 = base_path + "-BW.png"   #thresholded (black/white) image
outFile2 = base_path + "-WB.png"   #inverted thresholded image
outFile3 = base_path + "-CL.png"   #cleaned image (large components whitened)

black = 0
white = 255

#Define size (number of pixels) a component should consist of. Note that text
#may be connected and a smaller number will wipe out text from the image. This
#value needs to be worked out by trial-and-error for each type of image.
arsz = 2000

#Open input image in grayscale mode (flag 0) and get its pixels.
#cv2.imread() does not raise on a missing/unreadable file, it returns None.
imgGray = cv2.imread(inFile, 0)
if imgGray is None:
    print("\nError: cannot read image file {}\n".format(inFile))
    sys.exit(1)
cv2.imwrite(outFile0, imgGray)

#Work on a copy so that imgGray itself is left untouched
pixels = np.array(imgGray)

#Change pixels above threshold to white (255) and every other pixel to black.
#A single mask is used for both assignments so that pixels exactly equal to
#the threshold are also mapped (to black) and the result is strictly binary.
aboveThreshold = pixels > threshold
pixels[aboveThreshold] = white
pixels[~aboveThreshold] = black
imgBW = pixels
cv2.imwrite(outFile1, imgBW)

#Inverted copy: dark regions of the input become the non-zero foreground
imgWB = cv2.bitwise_not(imgBW)
cv2.imwrite(outFile2, imgWB)

# Apply the Component analysis function with 8-connectivity; CV_32S is the
# output label image type.
analysis = cv2.connectedComponentsWithStats(imgWB, 8, cv2.CV_32S)
#nLabels is the total number of labels where 0 represents the background label.
#A label is assigned to each pixel based on its location and neighbours. If a
#pixel is black (value = 0), it is skipped as default label is '0'. Thus, each
#connected region shall be labeled 1, 2, 3... nLabels-1.

(nLabels, labels, values, centroid) = analysis
#labels is a matrix of the size of the input image, each element holds the
#label of its pixel; values has one row of statistics per label.

#Build a per-label lookup table: True for every component whose area (pixel
#count) exceeds arsz. The background (label 0) is never selected.
isLarge = values[:, cv2.CC_STAT_AREA] > arsz
isLarge[0] = False

#Indexing the table with the label image yields, in a single pass, a boolean
#image that is True exactly on the pixels of the large components. This
#replaces building and OR-ing one full-size mask per component.
cMask = isLarge[labels].astype("uint8") * 255

#OR with 255 turns the selected pixels white and leaves all others unchanged
imgBW = cv2.bitwise_or(imgBW, cMask)

cv2.imwrite(outFile3, imgBW)