# Source code for kClusterLib.grossError

import numpy as np
import math
import logging
import matplotlib.pyplot as plt
from time import strftime
from scipy.stats import chi2
from db_connector import fetchRAWData
from usage import createClusters 
from kcTools import loadPcaData, loadKCFromDisk, separateClusters, isCobalt
from pca import extractRelations

from scipy.signal import tf2ss



previousSpnds = 0
previousFiltered = 0
count = 0


DEBUG = True

k = 1                             # 1.415*10**-20/60  A.cm2.min.n-1 (sensitivity; set to 1 here)
tau_p = 23.68/60                  # min
tau_d = 315.0/60                  # min

T = 1                             # sampling period is 1 min

## Continuous-time compensation model H(s) = k*(tau_p*s + 1)/(tau_d*s + 1),
## converted to a (1x1) state-space realisation.
A, B, C, D = tf2ss([k*tau_p, k], [tau_d, 1])
# if DEBUG:
#     print "A: ",A
#     print "B: ",B
#     print "C: ",C
#     print "D: ",D


## Bilinear (Tustin) discretization with sampling period T.  The resulting
## first-order filter is y[t] = ass*y[t-1] + cbss*x[t-1] + dss*x[t].
global_ass = (2 + A*T)/(2 - A*T)
global_cbss = (C*B*T - A*D*T - 2*D)/(2 - A*T)
global_dss = (C*B*T - A*D*T + 2*D)/(2 - A*T)
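
## Optional sanity check (a minimal sketch, not called anywhere by default):
## scipy.signal.cont2discrete with method='bilinear' should reproduce the
## hand-derived coefficients above.  This assumes cont2discrete and ss2tf are
## available in the installed scipy (>= 0.10).
def _checkTustinCoeffs():
    from scipy.signal import cont2discrete, ss2tf
    Ad, Bd, Cd, Dd, _ = cont2discrete((A, B, C, D), T, method='bilinear')
    num, den = ss2tf(Ad, Bd, Cd, Dd)
    ## Expected discrete transfer function: H(z) = (dss*z + cbss)/(z - ass)
    assert np.allclose(den, [1.0, -global_ass[0][0]])
    assert np.allclose(num[0], [global_dss[0][0], global_cbss[0][0]])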


#from filterV import ass, cbss, dss 



fo = ''
fo1 = ''
logger = ''


def configLOG():
    '''
    Configure the log file paths and the logging format.

    Parameters
    ----------
    None

    Returns
    -------
    None
    '''
    global fo
    global fo1
    global logger
    logging.basicConfig(filename='grossError.csv', format='%(message)s',
                        level=logging.DEBUG)
    logger = logging.getLogger(__name__)
    logger.debug('TIME STAMP,Ti,SPND,CLUSTER,MESSAGE,RAW DATA,BIAS,GAMMA,CHI SQUARE')
    fo = open("count.csv", "a+")
    print "Name of the file: ", fo.name
    #print "Closed or not : ", fo.closed
    print "Opening mode : ", fo.mode
    #print "Softspace flag : ", fo.softspace
    fo1 = open("window.csv", "a+")
    fo1.write('TIME STAMP,TI,SPND,CLUSTER,MESSAGE,WINDOWWIDTH,THRESHOLD\n')
    #print "Name of the file: ", fo1.name
    #print "Opening mode : ", fo1.mode
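
## configLOG() is invoked from errorCheck(); it appends to three files in the
## current working directory: grossError.csv (per-event log), count.csv
## (summary counters) and window.csv (window-test results).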
def errorCheck(**kwargs):
    '''
    Uses the PCA model to validate the data.

    Parameters
    ----------
    start_idx : int, optional
        Start index for selecting values from the database.  Default is 1.
    end_idx : int, optional
        End index for selecting values from the database.  Default is 4000.
    db : str
        Name of the database to connect to for fetching data.
    db_host : str
        IP of the machine running the MySQL server.
    u_name : str
        User name for connecting to the remote MySQL server.
    u_pass : str
        Password for connecting to the remote MySQL server.
    '''
    configLOG()

    ass = global_ass[0][0]
    cbss = global_cbss[0][0]
    dss = global_dss[0][0]

    ##..start/end index, database and credentials (with defaults)..##
    sidx = kwargs.get('start_idx', 1)
    eidx = kwargs.get('end_idx', 4000)
    db = kwargs.get('db', 'EXPeriment')
    db_host = kwargs.get('db_host', 'localhost')
    user = kwargs.get('u_name', 'analyst')
    u_pass = kwargs.get('u_pass', 'chemlab')

    ##..confidence level for the chi-squared test..##
    alpha = 0.05
    ##..window test..##
    windowWidth = 50
    faultThreshold = 0.7

    ##............Getting data from the DB.........................##
    rawDataMatCo, colNamesCo = fetchRAWData(debug=False, dbase=db,
                                            db_host=db_host, table='sensor_co',
                                            user=user, u_pass=u_pass,
                                            idx_start=sidx, idx_end=eidx)
    rawDataMatV, colNamesV = fetchRAWData(debug=False, dbase=db,
                                          db_host=db_host, table='sensor_v',
                                          user=user, u_pass=u_pass,
                                          idx_start=sidx, idx_end=eidx)
    rawDataMatCoV = np.concatenate((rawDataMatCo, rawDataMatV), axis=1)
    combinedLabels = np.concatenate((colNamesCo, colNamesV), axis=1)
    #main(npass = 8, start_idx=1, end_idx=14000)

    ##...Fetching cluster data...##
    cluster, inpMat, labels, combinedMean, combinedVar = loadKCFromDisk()

    ##...remove faulty SPND columns...##
    mask = [1 for i in range(len(combinedLabels))]  ## np.ones_like(combinedLabels)
    for i in combinedLabels:
        if i not in labels:
            mask[list(combinedLabels).index(i)] = 0
            # print "Faulty sensors:", i
            fo.write('Faulty sensors' + ',' + str(i) + '\n')
    bmask = np.array(mask, dtype=bool)
    cleanedLabels = combinedLabels[bmask]
    rawDataMatCoV = rawDataMatCoV[:, bmask]
    ##...rawDataMatCoV is now the raw data after removing faulty SPNDs...##
    #rawDataMatCoV[0,18] += -20
    ##..SPNDs 121 to 132 have values less than one hundred..##
    rCoV, cCoV = rawDataMatCoV.shape
    print "Total number of clean SPNDs :", cCoV
    fo.write('No. of rows in rawDataMatCoV' + ',' + str(rCoV) + '\n')
    fo.write('Total number of clean SPNDs' + ',' + str(cCoV) + '\n')

    mask = [isCobalt(i, cleanedLabels) for i in range(len(rawDataMatCoV[0, :]))]
    bmask = np.array(mask, dtype=bool)
    matCo = rawDataMatCoV[:, bmask]
    rCo, cCo = matCo.shape
    print "Total number of clean Cobalt SPNDs :", cCo
    fo.write('Total number of clean Cobalt SPNDs' + ',' + str(cCo) + '\n')
    cV = cCoV - cCo
    print "Total number of clean Vanadium SPNDs:", cV
    fo.write('Total number of clean Vanadium SPNDs' + ',' + str(cV) + '\n')
    ##..................................................................##

    clusterlist = separateClusters(cluster)
    print "Number of Clusters: ", len(clusterlist)
    print "Cluster result:\n", clusterlist
    fo.write('Number of Clusters: ' + str(len(clusterlist)) + '\n')
    for i in range(len(clusterlist)):
        clusterlist[i] = np.array(clusterlist[i])
        fo.write('cluster ' + str(i) + ',' + '\n' + str(clusterlist[i]) + '\n')

    ##.........store chi2.ppf values for 0..cCoV-1 degrees of freedom.....##
    chi2ppf = []
    tempchi2 = []
    for n in range(cCoV):
        p = chi2.ppf(1 - alpha, n)
        tempchi2.append(p)
    chi2ppf.append(tempchi2)
    chi2ppf = np.array(chi2ppf)

    ##...window which keeps updating if there is any fault in any SPND...##
    windowMat = np.zeros(shape=(windowWidth, cCoV))

    ## ........ load the PCA model from disk ........ ##
    Amat = []
    ResVarMat = []
    am, rm = loadPcaData()
    Amat.append(am)
    ResVarMat.append(rm)
    Amat = np.array(Amat)
    ResVarMat = np.array(ResVarMat)

    ##.........alternative: build the PCA model here instead of loading.....##
    #threshVarCap = 0.01
    #Amat = []
    #ResVarMat = []
    #tempAmat = []
    #tempResVarMat = []
    #for clusterk in range(len(clusterlist)):
    #    a, resVar = extractRelations(clusterlist[clusterk], inpMat, threshVarCap)
    #    tempAmat.append(a)
    #    tempResVarMat.append(resVar)
    #Amat.append(tempAmat)
    #Amat = np.array(Amat)
    #ResVarMat.append(tempResVarMat)
    #ResVarMat = np.array(ResVarMat)

    ##.......fault count initialization................##
    spndFaultCount = np.zeros(shape=(1, cCoV))
    clusterFaultCount = np.zeros(shape=(1, len(clusterlist)))

    ##...for filtering...##
    previousCo = np.zeros(shape=(1, cCo))
    previousFilteredCo = np.zeros(shape=(1, cCo))

    for ti in range(eidx - sidx):                  ##...for each time instance ti...##
        for clusterj in range(len(clusterlist)):   ##...accessing clusters serially...##
            ##..initializations..##
            presentData = []
            tempy = []
            x = []
            y = []
            tempPreFilCo = np.zeros(shape=(1, cCo))
            tempPreCo = np.zeros(shape=(1, cCo))
            flagIsNegative = False
            ##..initialization ends..##

            for spndk in range(len(clusterlist[clusterj])):
                ## spndk traverses all the SPNDs in this particular cluster;
                ## spndId is the currently selected sensor index.
                spndId = clusterlist[clusterj][spndk]
                x = rawDataMatCoV[ti][spndId]  ## current value of the k'th SPND
                if spndId <= cCo - 1:
                    ##...Cobalt data: filter it...##
                    if x <= 0 or np.isnan(x):
                        x = previousCo[0][spndId]
                    y = (ass * previousFilteredCo[0][spndId] +
                         cbss * previousCo[0][spndId] + dss * x)
                    tempPreFilCo[0][spndId] = y
                    tempPreCo[0][spndId] = x
                else:
                    y = x
                if x <= 0 or np.isnan(x):
                    flagIsNegative = True
                    windowMat[windowWidth - 1][spndId] = 1
                    if DEBUG:
                        print "Sensor output is zero/negative/'nan' for", \
                              ti, "th instance,", spndId, \
                              "th SPND belonging to", clusterj, "th cluster"
                    logger.debug(str(strftime("%Y-%m-%d %H:%M:%S")) + ',' +
                                 str(ti) + ',' + str(spndId) + ',' +
                                 str(clusterj) + ',' +
                                 'Sensor output is zero/negative/nan' + ',' +
                                 str(x))
                    ## Why store temp values instead of writing directly to the
                    ## global filter variables?  Because we are processing a
                    ## cluster with many SPNDs, and the NEXT SPND may have
                    ## invalid data, in which case the previous values must be
                    ## discarded.  We discard all values (for the current
                    ## cluster) if any SPND reading is invalid.
                    spndFaultCount[0][spndId] += 1
                z = (y - combinedMean[spndId]) / math.sqrt(combinedVar[spndId])
                tempy.append(z)  # mean-centred and scaled sensor value

            if flagIsNegative:
                ##...if any entry is zero/negative/'nan', delete all entries..##
                ##...for *this* time instance and *this* cluster...##
                tempy = []
                presentData.append(tempy)
                clusterFaultCount[0][clusterj] += 1
            else:
                presentData.append(tempy)
            presentData = np.array(presentData)

            if not len(presentData[0]) == 0:
                ## All the SPNDs of this cluster have meaningful values for
                ## this instance and the data have been
                ##   * centred,
                ##   * scaled,
                ##   * filtered (Co data).
                ## Store the cobalt values in the global filter variables for
                ## future filtering.
                for k in range(cCo):
                    if tempPreFilCo[0][k] == 0:
                        continue  # no update required
                    else:
                        previousFilteredCo[0][k] = tempPreFilCo[0][k]
                    if tempPreCo[0][k] == 0:
                        continue  # no update of the previous input required
                    else:
                        previousCo[0][k] = tempPreCo[0][k]

                ## fetch the model for this cluster
                A = Amat[0][clusterj]
                VarMatRes = ResVarMat[0][clusterj]
                r, c = np.shape(A)

                ## calculate the residue r(t) and the test statistic gamma;
                ## gamma is compared against the (1 - alpha) chi-squared
                ## quantile with r degrees of freedom
                presentResidualData = A * presentData.transpose()
                presentResidualData = np.array(presentResidualData)
                gamma = (presentResidualData.transpose() *
                         np.linalg.inv(VarMatRes) * presentResidualData)

                ## residue is above the threshold level
                if gamma >= chi2ppf[0][r]:
                    ## Fault isolation: for each SPND, hypothesise a bias
                    ## along that sensor's direction, estimate it (bspndk)
                    ## and compute the log-likelihood ratio (lspndk).
                    tempL = []
                    tempB = []
                    for spndk in range(len(clusterlist[clusterj])):
                        Espndk = np.zeros(shape=(len(clusterlist[clusterj]), 1))
                        Espndk[spndk][0] = 1
                        Fspndk = A * Espndk
                        factor = Fspndk.transpose() * np.linalg.inv(VarMatRes)
                        m1 = factor * Fspndk
                        m2 = factor * presentResidualData
                        bspndk = np.linalg.inv(m1) * m2
                        lspndk = m2.transpose() * bspndk
                        tempL.append(lspndk[0, 0])
                        tempB.append(bspndk[0, 0])
                    Lspndk = tempL[:]
                    Bspndk = tempB[:]

                    LspndkMax = max(Lspndk)
                    indexLspndk = Lspndk.index(LspndkMax)
                    ## rescale the bias estimate back to engineering units
                    Bspndk[indexLspndk] = (Bspndk[indexLspndk] *
                                           math.sqrt(combinedVar[clusterlist[clusterj][indexLspndk]]))
                    print "\nChi-squared test violated in cluster", clusterj, \
                          "for", ti, "th time instance"
                    print "gamma : ", gamma[0, 0]
                    print "chisquare: ", chi2ppf[0][r]
                    print "Log-likelihood-ratio maximizing (perhaps non-unique) SPND is", \
                          clusterlist[clusterj][indexLspndk]
                    print "Estimated gross error of the SPND : ", Bspndk[indexLspndk]
                    #print "standard deviation : ", math.sqrt(combinedVar[clusterlist[clusterj][indexLspndk]])
                    #print "mean : ", combinedMean[clusterlist[clusterj][indexLspndk]]
                    print "Raw data of that SPND : ", \
                          rawDataMatCoV[ti][clusterlist[clusterj][indexLspndk]]
                    logger.debug(str(strftime("%Y-%m-%d %H:%M:%S")) + ',' +
                                 str(ti) + ',' +
                                 str(clusterlist[clusterj][indexLspndk]) + ',' +
                                 str(clusterj) + ',' +
                                 'chisquare test violated' + ',' +
                                 str(rawDataMatCoV[ti][clusterlist[clusterj][indexLspndk]]) + ',' +
                                 str(round(Bspndk[indexLspndk], 4)) + ',' +
                                 str(round(gamma[0, 0], 4)) + ',' +
                                 str(round(chi2ppf[0][r], 4)))

                    ##..check whether the maximum value of Lspndk repeats..##
                    for i in range(len(Lspndk)):
                        if i != indexLspndk and Lspndk[i] == LspndkMax:
                            print "Other log-likelihood-ratio maximizing SPND: ", \
                                  clusterlist[clusterj][i]
                            logger.debug(str(strftime("%Y-%m-%d %H:%M:%S")) + ',' +
                                         str(ti) + ',' +
                                         'other log-likelihood-ratio maximizing SPND' + ',' +
                                         str(clusterlist[clusterj][i]))
                    spndFaultCount[0][clusterlist[clusterj][indexLspndk]] += 1
                    clusterFaultCount[0][clusterj] += 1
                    windowMat[windowWidth - 1][clusterlist[clusterj][indexLspndk]] = 1

        ##...check for multiple faults of an SPND within the present window...##
        for spndk in range(cCoV):
            if np.sum(windowMat[:, spndk]) / windowWidth > faultThreshold:
                for cluster in range(len(clusterlist)):
                    for spnd in range(len(clusterlist[cluster])):
                        if clusterlist[cluster][spnd] == spndk:
                            print "Faulty SPND ", spndk, "cluster", cluster, "at ti", ti
                            fo1.write(str(strftime("%Y-%m-%d %H:%M:%S")) + ',' +
                                      str(ti) + ',' + str(spndk) + ',' +
                                      str(cluster) + ',' +
                                      'Faulty SPND (window test)' + ',' +
                                      str(windowWidth) + ',' +
                                      str(faultThreshold) + '\n')
        ##...slide the window forward by one time instance...##
        for width in range(windowWidth - 1):
            windowMat[width] = windowMat[width + 1]
        windowMat[windowWidth - 1] = np.zeros(shape=(1, cCoV))

    print "spndFaultCount\n", spndFaultCount[0]
    maxFault = np.max(spndFaultCount[0])
    fo.write('spndFaultCount' + ',' + '\n' + str(spndFaultCount[0]) + '\n')

    plt.figure()
    index1 = range(len(spndFaultCount[0]))
    plt.bar(index1, spndFaultCount[0])
    plt.xlabel('SPND')
    plt.ylabel('Fault Count')
    plt.title('SPND Fault Count')
    plt.savefig('spndFaultCount.pdf')

    plt.figure()
    index2 = range(cCo)
    coFaultCount = spndFaultCount[:, range(cCo)]
    plt.bar(index2, coFaultCount[0])
    plt.xlabel('Co SPND')
    plt.ylabel('Fault Count')
    plt.title('Co SPND Fault Count [' + str(0) + ' to ' + str(cCo - 1) + ']')
    plt.ylim([0, maxFault * 1.05])
    plt.savefig('cobaltFaultCount.pdf')

    plt.figure()
    index3 = range(cCo, cCo + (cCoV - cCo) / 2)
    vFaultCount1 = spndFaultCount[:, range(cCo, cCo + (cCoV - cCo) / 2)]
    plt.bar(index3, vFaultCount1[0])
    plt.xlabel('V SPND')
    plt.ylabel('Fault Count')
    plt.title('V SPND Fault Count [' + str(cCo) + ' to ' +
              str(cCo + (cCoV - cCo) / 2 - 1) + ']')
    plt.ylim([0, maxFault * 1.05])
    plt.savefig('vanadiumFaultCount1.pdf')

    plt.figure()
    index4 = range(cCo + (cCoV - cCo) / 2, cCoV)
    vFaultCount2 = spndFaultCount[:, range(cCo + (cCoV - cCo) / 2, cCoV)]
    plt.bar(index4, vFaultCount2[0])
    plt.xlabel('V SPND')
    plt.ylabel('Fault Count')
    plt.title('V SPND Fault Count [' + str(cCo + (cCoV - cCo) / 2) + ' to ' +
              str(cCoV - 1) + ']')
    plt.ylim([0, maxFault * 1.05])
    plt.savefig('vanadiumFaultCount2.pdf')

    expectedLimit = (eidx - sidx + 1) * alpha
    print "clusterFaultCount\n", clusterFaultCount[0]
    fo.write('clusterFaultCount' + ',' + '\n' + str(clusterFaultCount[0]) + '\n')

    plt.figure()
    index5 = range(len(clusterFaultCount[0]))
    plt.bar(index5, clusterFaultCount[0])
    plt.axhline(y=expectedLimit, xmin=0, xmax=len(clusterFaultCount[0]),
                linewidth=2, linestyle='--', color='r')
    plt.xlabel('CLUSTER')
    plt.ylabel('Fault Count')
    plt.title('Cluster Fault Count')
    plt.savefig('clusterFaultCount.pdf')

    p = 0
    for i in range(len(clusterlist)):
        p += clusterFaultCount[0][i]
    print "Total number of instances of fault occurrence in all clusters:", p
    fo.write('Total number of instances of fault occurrence in all clusters' +
             ': ' + str(p) + '\n')

    frequencyFaultCluster = clusterFaultCount[0] / (eidx - sidx + 1)
    print "frequencyFaultCluster\n", frequencyFaultCluster
    fo.write('frequencyFaultCluster' + ',' + '\n' +
             str(frequencyFaultCluster) + '\n')

    plt.figure()
    index6 = range(len(frequencyFaultCluster))
    plt.bar(index6, frequencyFaultCluster)
    plt.axhline(y=alpha, xmin=0, xmax=len(clusterFaultCount[0]),
                linewidth=2, linestyle='--', color='r')
    plt.xlabel('CLUSTER')
    plt.ylabel('Frequency of fault')
    plt.title('Frequency of Cluster Fault')
    plt.savefig('FreqClusterFaultCount.pdf')
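
## Example invocation (the keyword values shown are simply the documented
## defaults; substitute your own database and credentials):
##
##     errorCheck(start_idx=1, end_idx=4000, db='EXPeriment',
##                db_host='localhost', u_name='analyst', u_pass='chemlab')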
if __name__ == '__main__':
    errorCheck()