Source code for kClusterLib.pca

import numpy as np
from kcTools import loadKCFromDisk, separateClusters
from kcTools import halt
from kcTools import savePcaData
from usage import createClusters
import time

# Debug message printing control: set DEBUG = True to enable verbose output.
DEBUG = False




def extractRelations(spnd_ids, inpMat, threshVarCap):
    '''
    Extracts the PCA relation model for one cluster of SPNDs: the
    eigenvectors spanning the low-variance subspace and the covariance
    matrix of the corresponding residuals.

    Parameters
    ----------
    spnd_ids : list
        List of SPND IDs that belong to the same cluster.
    inpMat : ndarray
        Matrix containing the SPND values (one column per SPND).
    threshVarCap : float
        Threshold, given as a fraction of the total variance, used to
        select the eigenvalues.

    Returns
    -------
    relationEvectors : ndarray
        Matrix of the selected eigenvectors (one eigenvector per row).
    covResidualMat : ndarray
        Covariance matrix of the residuals.

    Example
    -------
    >>> cluster, inpMat, labels, combinedMean, combinedVar = loadKCFromDisk()
    >>> clusterlist = separateClusters(cluster)
    >>> threshVarCap = 0.01
    >>> for i in range(max(cluster) + 1):
    ...     a, b = extractRelations(clusterlist[i], inpMat, threshVarCap)
    ...     print a, b
    '''
    # Each cluster holds a set of sensor IDs; build a matrix whose rows are
    # the sample series of those sensors.
    timeinstances, spnds = np.shape(np.matrix(inpMat))
    cluster = spnd_ids
    temp = []
    for i in range(len(cluster)):
        x = inpMat[:, cluster[i]]
        temp.append(np.ravel(x))
    clusterMat = np.matrix(np.array(temp))      # shape: (sensors, timeinstances)

    covMat = (clusterMat * clusterMat.transpose()) / timeinstances
    #print "Covariance matrix:\n", covMat

    evalues, evectors = np.linalg.eigh(covMat)
    evectors = np.asarray(evectors).transpose()  # each row is now an eigenvector
    #print "evectors:\n", evectors

    forceFlag = False
    if np.sum(evalues) == 0:
        print "Error: sum of eigenvalues is zero"
        halt()
        forceFlag = True
        print "WARNING : further calculations will be based on sum(evalues)=1 !"

    idx = evalues.argsort()            # sort in ascending order
    #idx = evalues.argsort()[::-1]     # for sorting in descending order
    evalues = evalues[idx]
    evectors = evectors[idx, :]
    #print "Eigenvalues:\n", evalues, "\nEigenvectors:\n", evectors
    if DEBUG:
        print "\nNumber of Eigenvalues:", len(evalues)

    # Keep the smallest eigenvalues whose cumulative share of the total
    # variance stays below threshVarCap; their eigenvectors describe the
    # (near-)linear relations within the cluster.
    relationEvectors = []
    relationEvalues = []
    sumevalues = 0
    if not forceFlag:
        sumofallevalues = np.sum(evalues)
    else:
        sumofallevalues = 1
    for i in range(len(evalues)):
        sumevalues += evalues[i]
        r = sumevalues / sumofallevalues
        #print "ratio", r
        if r <= threshVarCap:
            relationEvectors.append(evectors[i])
            relationEvalues.append(evalues[i])
        else:
            break

    if DEBUG:
        print "Number of valid relations:", len(relationEvectors)
    if len(relationEvectors) == 0:
        relationEvectors = evectors[0]
        relationEvalues = evalues[0]
        print "No valid relation has been found, so the smallest eigenvalue and its eigenvector have been taken!"

    relationEvectors = np.matrix(np.array(relationEvectors))
    if DEBUG:
        print "covMat size: ", covMat.shape

    residualMat = relationEvectors * clusterMat
    #print "residualMat:\n", residualMat

    # Time profiling showed that projecting covMat (method II below) is faster
    # than computing residualMat * residualMat.transpose() / timeinstances.
    covResidualMat = relationEvectors * covMat * relationEvectors.transpose()

    if DEBUG:
        print "relationEvectors(A) size:", relationEvectors.shape
        print "covResidualMat(V) size: ", covResidualMat.shape
    #print "covResidualMat(V):\n", covResidualMat
    #print "inv of covResidualMat(V):\n", np.linalg.inv(covResidualMat)
    return relationEvectors, covResidualMat
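
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the library): one way the pair (A, V)
# returned by extractRelations() could be applied to a new sample vector.
# The helper name `scoreSample` and the Mahalanobis-style statistic are
# assumptions chosen for illustration only; the consistency check actually
# used elsewhere in kClusterLib may differ.
#
# A (relationEvectors) projects a sample onto the low-variance directions of
# the cluster, and V (covResidualMat) is the covariance of those residuals,
# so  d = r' * inv(V) * r  with  r = A * x  behaves like a squared
# Mahalanobis distance of the residual: a large d suggests the sample
# violates the linear relations learned for the cluster.
def scoreSample(relationEvectors, covResidualMat, sample):
    '''Mahalanobis-style residual score of one sample vector (sketch only).'''
    x = np.matrix(np.ravel(sample)).transpose()            # column vector, shape (sensors, 1)
    r = relationEvectors * x                                # residual, shape (relations, 1)
    d = r.transpose() * np.linalg.inv(covResidualMat) * r   # 1x1 matrix
    return float(d)
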
def makeModels(eigenThreshold, clusterMap, dataMat):
    '''
    Creates PCA models for all clusters and stores them on disk.

    Parameters
    ----------
    eigenThreshold : float
        Threshold (fraction) used to filter out the eigenvalues.
    clusterMap : list
        Mapping of SPND <--> cluster ID.
    dataMat : ndarray
        Sensor data matrix on which the models are created.

    Returns
    -------
    fname : string
        Name of the file created on disk.
    '''
    threshVarCap = eigenThreshold
    inpMat = dataMat
    clusterList = separateClusters(clusterMap)

    totalA = []
    totalB = []
    for i in range(max(clusterMap) + 1):
        a, b = extractRelations(clusterList[i], inpMat, threshVarCap)
        totalA.append(a)
        totalB.append(b)
    return savePcaData(totalA, totalB)
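
# ---------------------------------------------------------------------------
# Minimal usage sketch for makeModels(), kept commented out so it does not
# run at import time.  The synthetic inputs below are assumptions: in normal
# use clusterMap and dataMat come from loadKCFromDisk().  dataMat has one
# column per SPND, and clusterMap[i] is the cluster ID of SPND i.
#
# demoData = np.random.randn(1000, 4)    # 1000 time instances, 4 SPNDs
# demoClusterMap = [0, 0, 1, 1]          # SPNDs 0,1 -> cluster 0; SPNDs 2,3 -> cluster 1
# fname = makeModels(0.01, demoClusterMap, demoData)
# print "PCA model written to:", fname
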
if __name__ == '__main__':
    clusterMap, dataMat, labels, combinedMean, combinedVar = loadKCFromDisk()
    makeModels(0.01, clusterMap, dataMat)
    exit()

    # Older standalone test path; unreachable because of the exit() above.
    #createClusters(npass=8)
    cluster, inpMat, labels, combinedMean, combinedVar = loadKCFromDisk()
    if DEBUG:
        print "Total number of clusters: ", max(cluster) + 1
    clusterlist = separateClusters(cluster)
    #if DEBUG: print "Cluster result:\n", clusterlist
    threshVarCap = 0.01
    #np.set_printoptions(formatter={'float': lambda x: "{0:0.0f}".format(x)})
    for i in range(max(cluster) + 1):
        a, b = extractRelations(clusterlist[i], inpMat, threshVarCap)
        print "Iteration:\t\t\t{}".format(i)
        print a, b
        #print "For", i, "th cluster Covariance Residual Matrix:\n", b
        #print "relationEvectors\n", a