# Source code for kClusterLib.grossError

import numpy as np
import math
import logging
import matplotlib.pyplot as plt
from time import strftime
from scipy.stats import chi2
from db_connector import fetchRAWData
from usage import createClusters 
from kcTools import loadPcaData, loadKCFromDisk, separateClusters, isCobalt
from pca import extractRelations

from scipy.signal import tf2ss



previousSpnds = 0
previousFiltered = 0
count = 0


DEBUG = True

k = 1                             # 1.415*10**-20/60  A.cm2.min.n-1 (sensitivity; set to 1 here)
tau_p = 23.68/60                  # min
tau_d = 315.0/60                  # min

T = 1                             # sampling period is 1 min

## Continuous-time compensation model H(s) = k*(tau_p*s + 1)/(tau_d*s + 1),
## converted to a (1x1) state-space realisation.
A, B, C, D = tf2ss([k*tau_p, k], [tau_d, 1])
# if DEBUG:
#     print "A: ",A
#     print "B: ",B
#     print "C: ",C
#     print "D: ",D


## Bilinear (Tustin) discretization with sampling period T.  The resulting
## first-order filter is y[t] = ass*y[t-1] + cbss*x[t-1] + dss*x[t].
global_ass = (2 + A*T)/(2 - A*T)
global_cbss = (C*B*T - A*D*T - 2*D)/(2 - A*T)
global_dss = (C*B*T - A*D*T + 2*D)/(2 - A*T)
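
## Optional sanity check (a minimal sketch, not called anywhere by default):
## scipy.signal.cont2discrete with method='bilinear' should reproduce the
## hand-derived coefficients above.  This assumes cont2discrete and ss2tf are
## available in the installed scipy (>= 0.10).
def _checkTustinCoeffs():
    from scipy.signal import cont2discrete, ss2tf
    Ad, Bd, Cd, Dd, _ = cont2discrete((A, B, C, D), T, method='bilinear')
    num, den = ss2tf(Ad, Bd, Cd, Dd)
    ## Expected discrete transfer function: H(z) = (dss*z + cbss)/(z - ass)
    assert np.allclose(den, [1.0, -global_ass[0][0]])
    assert np.allclose(num[0], [global_dss[0][0], global_cbss[0][0]])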


#from filterV import ass, cbss, dss 



fo = ''
fo1 = ''
logger = ''


def configLOG():
    '''
    Configure the log file paths and the logging format.

    Parameters
    ----------
    None

    Returns
    -------
    None
    '''
    global fo
    global fo1
    global logger
    logging.basicConfig(filename='grossError.csv', format='%(message)s',
                        level=logging.DEBUG)
    logger = logging.getLogger(__name__)
    logger.debug('TIME STAMP,Ti,SPND,CLUSTER,MESSAGE,RAW DATA,BIAS,GAMMA,CHI SQUARE')
    fo = open("count.csv", "a+")
    print "Name of the file: ", fo.name
    #print "Closed or not : ", fo.closed
    print "Opening mode : ", fo.mode
    #print "Softspace flag : ", fo.softspace
    fo1 = open("window.csv", "a+")
    fo1.write('TIME STAMP,TI,SPND,CLUSTER,MESSAGE,WINDOWWIDTH,THRESHOLD\n')
    #print "Name of the file: ", fo1.name
    #print "Opening mode : ", fo1.mode
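
## configLOG() is invoked from errorCheck(); it appends to three files in the
## current working directory: grossError.csv (per-event log), count.csv
## (summary counters) and window.csv (window-test results).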
def errorCheck(**kwargs):
    '''
    Uses the PCA model to validate the data.

    Parameters
    ----------
    start_idx : int, optional
        Start index for selecting values from the database.  Default is 1.
    end_idx : int, optional
        End index for selecting values from the database.  Default is 4000.
    db : str
        Name of the database to connect to for fetching data.
    db_host : str
        IP of the machine running the MySQL server.
    u_name : str
        User name for connecting to the remote MySQL server.
    u_pass : str
        Password for connecting to the remote MySQL server.
    '''
    configLOG()

    ass = global_ass[0][0]
    cbss = global_cbss[0][0]
    dss = global_dss[0][0]

    ##..start/end index, database and credentials (with defaults)..##
    sidx = kwargs.get('start_idx', 1)
    eidx = kwargs.get('end_idx', 4000)
    db = kwargs.get('db', 'EXPeriment')
    db_host = kwargs.get('db_host', 'localhost')
    user = kwargs.get('u_name', 'analyst')
    u_pass = kwargs.get('u_pass', 'chemlab')

    ##..confidence level for the chi-squared test..##
    alpha = 0.05
    ##..window test..##
    windowWidth = 50
    faultThreshold = 0.7

    ##............Getting data from the DB.........................##
    rawDataMatCo, colNamesCo = fetchRAWData(debug=False, dbase=db,
                                            db_host=db_host, table='sensor_co',
                                            user=user, u_pass=u_pass,
                                            idx_start=sidx, idx_end=eidx)
    rawDataMatV, colNamesV = fetchRAWData(debug=False, dbase=db,
                                          db_host=db_host, table='sensor_v',
                                          user=user, u_pass=u_pass,
                                          idx_start=sidx, idx_end=eidx)
    rawDataMatCoV = np.concatenate((rawDataMatCo, rawDataMatV), axis=1)
    combinedLabels = np.concatenate((colNamesCo, colNamesV), axis=1)
    #main(npass = 8, start_idx=1, end_idx=14000)

    ##...Fetching cluster data...##
    cluster, inpMat, labels, combinedMean, combinedVar = loadKCFromDisk()

    ##...remove faulty SPND columns...##
    mask = [1 for i in range(len(combinedLabels))]  ## np.ones_like(combinedLabels)
    for i in combinedLabels:
        if i not in labels:
            mask[list(combinedLabels).index(i)] = 0
            # print "Faulty sensors:", i
            fo.write('Faulty sensors' + ',' + str(i) + '\n')
    bmask = np.array(mask, dtype=bool)
    cleanedLabels = combinedLabels[bmask]
    rawDataMatCoV = rawDataMatCoV[:, bmask]
    ##...rawDataMatCoV is now the raw data after removing faulty SPNDs...##
    #rawDataMatCoV[0,18] += -20
    ##..SPNDs 121 to 132 have values less than one hundred..##
    rCoV, cCoV = rawDataMatCoV.shape
    print "Total number of clean SPNDs :", cCoV
    fo.write('No. of rows in rawDataMatCoV' + ',' + str(rCoV) + '\n')
    fo.write('Total number of clean SPNDs' + ',' + str(cCoV) + '\n')

    mask = [isCobalt(i, cleanedLabels) for i in range(len(rawDataMatCoV[0, :]))]
    bmask = np.array(mask, dtype=bool)
    matCo = rawDataMatCoV[:, bmask]
    rCo, cCo = matCo.shape
    print "Total number of clean Cobalt SPNDs :", cCo
    fo.write('Total number of clean Cobalt SPNDs' + ',' + str(cCo) + '\n')
    cV = cCoV - cCo
    print "Total number of clean Vanadium SPNDs:", cV
    fo.write('Total number of clean Vanadium SPNDs' + ',' + str(cV) + '\n')
    ##..................................................................##

    clusterlist = separateClusters(cluster)
    print "Number of Clusters: ", len(clusterlist)
    print "Cluster result:\n", clusterlist
    fo.write('Number of Clusters: ' + str(len(clusterlist)) + '\n')
    for i in range(len(clusterlist)):
        clusterlist[i] = np.array(clusterlist[i])
        fo.write('cluster ' + str(i) + ',' + '\n' + str(clusterlist[i]) + '\n')

    ##.........store chi2.ppf values for 0..cCoV-1 degrees of freedom.....##
    chi2ppf = []
    tempchi2 = []
    for n in range(cCoV):
        p = chi2.ppf(1 - alpha, n)
        tempchi2.append(p)
    chi2ppf.append(tempchi2)
    chi2ppf = np.array(chi2ppf)

    ##...window which keeps updating if there is any fault in any SPND...##
    windowMat = np.zeros(shape=(windowWidth, cCoV))

    ## ........ load the PCA model from disk ........ ##
    Amat = []
    ResVarMat = []
    am, rm = loadPcaData()
    Amat.append(am)
    ResVarMat.append(rm)
    Amat = np.array(Amat)
    ResVarMat = np.array(ResVarMat)

    ##.........alternative: build the PCA model here instead of loading.....##
    #threshVarCap = 0.01
    #Amat = []
    #ResVarMat = []
    #tempAmat = []
    #tempResVarMat = []
    #for clusterk in range(len(clusterlist)):
    #    a, resVar = extractRelations(clusterlist[clusterk], inpMat, threshVarCap)
    #    tempAmat.append(a)
    #    tempResVarMat.append(resVar)
    #Amat.append(tempAmat)
    #Amat = np.array(Amat)
    #ResVarMat.append(tempResVarMat)
    #ResVarMat = np.array(ResVarMat)

    ##.......fault count initialization................##
    spndFaultCount = np.zeros(shape=(1, cCoV))
    clusterFaultCount = np.zeros(shape=(1, len(clusterlist)))

    ##...for filtering...##
    previousCo = np.zeros(shape=(1, cCo))
    previousFilteredCo = np.zeros(shape=(1, cCo))

    for ti in range(eidx - sidx):                  ##...for each time instance ti...##
        for clusterj in range(len(clusterlist)):   ##...accessing clusters serially...##
            ##..initializations..##
            presentData = []
            tempy = []
            x = []
            y = []
            tempPreFilCo = np.zeros(shape=(1, cCo))
            tempPreCo = np.zeros(shape=(1, cCo))
            flagIsNegative = False
            ##..initialization ends..##

            for spndk in range(len(clusterlist[clusterj])):
                ## spndk traverses all the SPNDs in this particular cluster;
                ## spndId is the currently selected sensor index.
                spndId = clusterlist[clusterj][spndk]
                x = rawDataMatCoV[ti][spndId]  ## current value of the k'th SPND
                if spndId <= cCo - 1:
                    ##...Cobalt data: filter it...##
                    if x <= 0 or np.isnan(x):
                        x = previousCo[0][spndId]
                    y = (ass * previousFilteredCo[0][spndId] +
                         cbss * previousCo[0][spndId] + dss * x)
                    tempPreFilCo[0][spndId] = y
                    tempPreCo[0][spndId] = x
                else:
                    y = x
                if x <= 0 or np.isnan(x):
                    flagIsNegative = True
                    windowMat[windowWidth - 1][spndId] = 1
                    if DEBUG:
                        print "Sensor output is zero/negative/'nan' for", \
                              ti, "th instance,", spndId, \
                              "th SPND belonging to", clusterj, "th cluster"
                    logger.debug(str(strftime("%Y-%m-%d %H:%M:%S")) + ',' +
                                 str(ti) + ',' + str(spndId) + ',' +
                                 str(clusterj) + ',' +
                                 'Sensor output is zero/negative/nan' + ',' +
                                 str(x))
                    ## Why store temp values instead of writing directly to the
                    ## global filter variables?  Because we are processing a
                    ## cluster with many SPNDs, and the NEXT SPND may have
                    ## invalid data, in which case the previous values must be
                    ## discarded.  We discard all values (for the current
                    ## cluster) if any SPND reading is invalid.
                    spndFaultCount[0][spndId] += 1
                z = (y - combinedMean[spndId]) / math.sqrt(combinedVar[spndId])
                tempy.append(z)  # mean-centred and scaled sensor value

            if flagIsNegative:
                ##...if any entry is zero/negative/'nan', delete all entries..##
                ##...for *this* time instance and *this* cluster...##
                tempy = []
                presentData.append(tempy)
                clusterFaultCount[0][clusterj] += 1
            else:
                presentData.append(tempy)
            presentData = np.array(presentData)

            if not len(presentData[0]) == 0:
                ## All the SPNDs of this cluster have meaningful values for
                ## this instance and the data have been
                ##   * centred,
                ##   * scaled,
                ##   * filtered (Co data).
                ## Store the cobalt values in the global filter variables for
                ## future filtering.
                for k in range(cCo):
                    if tempPreFilCo[0][k] == 0:
                        continue  # no update required
                    else:
                        previousFilteredCo[0][k] = tempPreFilCo[0][k]
                    if tempPreCo[0][k] == 0:
                        continue  # no update of the previous input required
                    else:
                        previousCo[0][k] = tempPreCo[0][k]

                ## fetch the model for this cluster
                A = Amat[0][clusterj]
                VarMatRes = ResVarMat[0][clusterj]
                r, c = np.shape(A)

                ## calculate the residue r(t) and the test statistic gamma;
                ## gamma is compared against the (1 - alpha) chi-squared
                ## quantile with r degrees of freedom
                presentResidualData = A * presentData.transpose()
                presentResidualData = np.array(presentResidualData)
                gamma = (presentResidualData.transpose() *
                         np.linalg.inv(VarMatRes) * presentResidualData)

                ## residue is above the threshold level
                if gamma >= chi2ppf[0][r]:
                    ## Fault isolation: for each SPND, hypothesise a bias
                    ## along that sensor's direction, estimate it (bspndk)
                    ## and compute the log-likelihood ratio (lspndk).
                    tempL = []
                    tempB = []
                    for spndk in range(len(clusterlist[clusterj])):
                        Espndk = np.zeros(shape=(len(clusterlist[clusterj]), 1))
                        Espndk[spndk][0] = 1
                        Fspndk = A * Espndk
                        factor = Fspndk.transpose() * np.linalg.inv(VarMatRes)
                        m1 = factor * Fspndk
                        m2 = factor * presentResidualData
                        bspndk = np.linalg.inv(m1) * m2
                        lspndk = m2.transpose() * bspndk
                        tempL.append(lspndk[0, 0])
                        tempB.append(bspndk[0, 0])
                    Lspndk = tempL[:]
                    Bspndk = tempB[:]

                    LspndkMax = max(Lspndk)
                    indexLspndk = Lspndk.index(LspndkMax)
                    ## rescale the bias estimate back to engineering units
                    Bspndk[indexLspndk] = (Bspndk[indexLspndk] *
                                           math.sqrt(combinedVar[clusterlist[clusterj][indexLspndk]]))
                    print "\nChi-squared test violated in cluster", clusterj, \
                          "for", ti, "th time instance"
                    print "gamma : ", gamma[0, 0]
                    print "chisquare: ", chi2ppf[0][r]
                    print "Log-likelihood-ratio maximizing (perhaps non-unique) SPND is", \
                          clusterlist[clusterj][indexLspndk]
                    print "Estimated gross error of the SPND : ", Bspndk[indexLspndk]
                    #print "standard deviation : ", math.sqrt(combinedVar[clusterlist[clusterj][indexLspndk]])
                    #print "mean : ", combinedMean[clusterlist[clusterj][indexLspndk]]
                    print "Raw data of that SPND : ", \
                          rawDataMatCoV[ti][clusterlist[clusterj][indexLspndk]]
                    logger.debug(str(strftime("%Y-%m-%d %H:%M:%S")) + ',' +
                                 str(ti) + ',' +
                                 str(clusterlist[clusterj][indexLspndk]) + ',' +
                                 str(clusterj) + ',' +
                                 'chisquare test violated' + ',' +
                                 str(rawDataMatCoV[ti][clusterlist[clusterj][indexLspndk]]) + ',' +
                                 str(round(Bspndk[indexLspndk], 4)) + ',' +
                                 str(round(gamma[0, 0], 4)) + ',' +
                                 str(round(chi2ppf[0][r], 4)))

                    ##..check whether the maximum value of Lspndk repeats..##
                    for i in range(len(Lspndk)):
                        if i != indexLspndk and Lspndk[i] == LspndkMax:
                            print "Other log-likelihood-ratio maximizing SPND: ", \
                                  clusterlist[clusterj][i]
                            logger.debug(str(strftime("%Y-%m-%d %H:%M:%S")) + ',' +
                                         str(ti) + ',' +
                                         'other log-likelihood-ratio maximizing SPND' + ',' +
                                         str(clusterlist[clusterj][i]))
                    spndFaultCount[0][clusterlist[clusterj][indexLspndk]] += 1
                    clusterFaultCount[0][clusterj] += 1
                    windowMat[windowWidth - 1][clusterlist[clusterj][indexLspndk]] = 1

        ##...check for multiple faults of an SPND within the present window...##
        for spndk in range(cCoV):
            if np.sum(windowMat[:, spndk]) / windowWidth > faultThreshold:
                for cluster in range(len(clusterlist)):
                    for spnd in range(len(clusterlist[cluster])):
                        if clusterlist[cluster][spnd] == spndk:
                            print "Faulty SPND ", spndk, "cluster", cluster, "at ti", ti
                            fo1.write(str(strftime("%Y-%m-%d %H:%M:%S")) + ',' +
                                      str(ti) + ',' + str(spndk) + ',' +
                                      str(cluster) + ',' +
                                      'Faulty SPND (window test)' + ',' +
                                      str(windowWidth) + ',' +
                                      str(faultThreshold) + '\n')
        ##...slide the window forward by one time instance...##
        for width in range(windowWidth - 1):
            windowMat[width] = windowMat[width + 1]
        windowMat[windowWidth - 1] = np.zeros(shape=(1, cCoV))

    print "spndFaultCount\n", spndFaultCount[0]
    maxFault = np.max(spndFaultCount[0])
    fo.write('spndFaultCount' + ',' + '\n' + str(spndFaultCount[0]) + '\n')

    plt.figure()
    index1 = range(len(spndFaultCount[0]))
    plt.bar(index1, spndFaultCount[0])
    plt.xlabel('SPND')
    plt.ylabel('Fault Count')
    plt.title('SPND Fault Count')
    plt.savefig('spndFaultCount.pdf')

    plt.figure()
    index2 = range(cCo)
    coFaultCount = spndFaultCount[:, range(cCo)]
    plt.bar(index2, coFaultCount[0])
    plt.xlabel('Co SPND')
    plt.ylabel('Fault Count')
    plt.title('Co SPND Fault Count [' + str(0) + ' to ' + str(cCo - 1) + ']')
    plt.ylim([0, maxFault * 1.05])
    plt.savefig('cobaltFaultCount.pdf')

    plt.figure()
    index3 = range(cCo, cCo + (cCoV - cCo) / 2)
    vFaultCount1 = spndFaultCount[:, range(cCo, cCo + (cCoV - cCo) / 2)]
    plt.bar(index3, vFaultCount1[0])
    plt.xlabel('V SPND')
    plt.ylabel('Fault Count')
    plt.title('V SPND Fault Count [' + str(cCo) + ' to ' +
              str(cCo + (cCoV - cCo) / 2 - 1) + ']')
    plt.ylim([0, maxFault * 1.05])
    plt.savefig('vanadiumFaultCount1.pdf')

    plt.figure()
    index4 = range(cCo + (cCoV - cCo) / 2, cCoV)
    vFaultCount2 = spndFaultCount[:, range(cCo + (cCoV - cCo) / 2, cCoV)]
    plt.bar(index4, vFaultCount2[0])
    plt.xlabel('V SPND')
    plt.ylabel('Fault Count')
    plt.title('V SPND Fault Count [' + str(cCo + (cCoV - cCo) / 2) + ' to ' +
              str(cCoV - 1) + ']')
    plt.ylim([0, maxFault * 1.05])
    plt.savefig('vanadiumFaultCount2.pdf')

    expectedLimit = (eidx - sidx + 1) * alpha
    print "clusterFaultCount\n", clusterFaultCount[0]
    fo.write('clusterFaultCount' + ',' + '\n' + str(clusterFaultCount[0]) + '\n')

    plt.figure()
    index5 = range(len(clusterFaultCount[0]))
    plt.bar(index5, clusterFaultCount[0])
    plt.axhline(y=expectedLimit, xmin=0, xmax=len(clusterFaultCount[0]),
                linewidth=2, linestyle='--', color='r')
    plt.xlabel('CLUSTER')
    plt.ylabel('Fault Count')
    plt.title('Cluster Fault Count')
    plt.savefig('clusterFaultCount.pdf')

    p = 0
    for i in range(len(clusterlist)):
        p += clusterFaultCount[0][i]
    print "Total number of instances of fault occurrence in all clusters:", p
    fo.write('Total number of instances of fault occurrence in all clusters' +
             ': ' + str(p) + '\n')

    frequencyFaultCluster = clusterFaultCount[0] / (eidx - sidx + 1)
    print "frequencyFaultCluster\n", frequencyFaultCluster
    fo.write('frequencyFaultCluster' + ',' + '\n' +
             str(frequencyFaultCluster) + '\n')

    plt.figure()
    index6 = range(len(frequencyFaultCluster))
    plt.bar(index6, frequencyFaultCluster)
    plt.axhline(y=alpha, xmin=0, xmax=len(clusterFaultCount[0]),
                linewidth=2, linestyle='--', color='r')
    plt.xlabel('CLUSTER')
    plt.ylabel('Frequency of fault')
    plt.title('Frequency of Cluster Fault')
    plt.savefig('FreqClusterFaultCount.pdf')
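
## Example invocation (the keyword values shown are simply the documented
## defaults; substitute your own database and credentials):
##
##     errorCheck(start_idx=1, end_idx=4000, db='EXPeriment',
##                db_host='localhost', u_name='analyst', u_pass='chemlab')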
if __name__ == '__main__':
    errorCheck()