Source code for kClusterLib.kcTools

# -*- coding: utf-8 -*-
""" Some clustering related tools

This module is a collection of many useful functions.

.. contents:: 
    :depth: 3

"""



import numpy as np
import re
import os
import sys
import time
from time import strftime
import datetime

import Pycluster

DEBUG = False  # module-wide verbosity flag; set to True for diagnostic prints


def centerVectors(sensor_vectors):
    """
    Mean-center every SPND channel (column) and scale it to unit variance.

    :math:`\\tilde{s}_{ij} = \\frac{s_{ij} - \\bar{s}_j}{\\sqrt{\\sigma_j^2}}`

    The input matrix is never modified; all work happens on a copy. Integer
    input is promoted to ``float64`` so the centered values are not truncated.

    Parameters
    ----------
    sensor_vectors : numpy.ndarray
        2-D SPND data matrix. SPNDs along columns.

    Returns
    -------
    centered_array : numpy.ndarray
        Matrix (SPNDs along columns) with elements centered and scaled.
    mean_vector : list
        Per-column mean (``numpy.mean``), one entry per SPND.
    var_vector : list
        Per-column population variance (``numpy.var``), one entry per SPND.
        Note that this is the variance :math:`\\sigma^2`, *not* its square
        root; the scaling itself divides by :math:`\\sqrt{\\sigma^2}`.

    Notes
    -----
    A column that contains NaN, or whose variance is zero/NaN, cannot be
    scaled. Such a column is set to zero, a message is printed and the
    interactive :func:`halt` prompt is invoked.
    """
    t1 = time.time()

    # Unpacking also enforces that the input is two-dimensional.
    _, n_channels = sensor_vectors.shape

    # Work on a copy. An integer copy would silently truncate the centered
    # and scaled values on assignment, so non-float input is promoted.
    if np.issubdtype(sensor_vectors.dtype, np.inexact):
        centered_array = np.array(sensor_vectors, copy=True)
    else:
        centered_array = np.array(sensor_vectors, dtype=np.float64)

    # 1-D results are kept in plain lists (one entry per SPND channel).
    mean_vector = []
    var_vector = []
    for col in range(n_channels):
        channel = sensor_vectors[:, col]
        channel_mean = np.mean(channel)
        if DEBUG:
            print("    np mean is: {}".format(channel_mean))
        mean_vector.append(channel_mean)
        var_vector.append(np.var(channel, dtype=np.float64))

    # Subtract the mean and divide by the standard deviation, column by column.
    for col in range(n_channels):
        centered_array[:, col] = centered_array[:, col] - mean_vector[col]

        if DEBUG:
            print("Mean centering residue for col:{} is {}".format(
                col, np.mean(centered_array[:, col])))

        divisor = np.sqrt(var_vector[col])

        if (np.isnan(centered_array[:, col]).any()
                or divisor == 0 or np.isnan(divisor)):
            # The column cannot be scaled: neutralise it and ask the operator.
            centered_array[:, col] = 0
            print("\r\nUnexpected Halt. \r\nEncountered zero variance while scaling ")
            halt()
        else:
            centered_array[:, col] = centered_array[:, col] / divisor

        # Defensive re-check: scaling must not have introduced any NaN.
        nan_count = np.isnan(centered_array[:, col]).sum()
        if nan_count > 0:
            print(centered_array[:, col])
            print(divisor)
            print("{} {} nan's found".format(col, nan_count))
            centered_array[:, col] = 0

    if DEBUG:
        print("::: Module executed in {} seconds, status 0 :::".format(time.time() - t1))

    return centered_array, mean_vector, var_vector


def getCentroids(mat, clusters):
    '''
    Gives Centroids of clusters. Requires data matrix and cluster map.

    Parameters
    ----------
    mat : numpy.ndarray
        2-D data matrix, SPNDs along columns.
    clusters : sequence of int
        ``clusters[j]`` is the cluster id of column ``j`` of ``mat``.

    Returns
    -------
    cluster_means : numpy.ndarray
        Float matrix with one column per cluster id (``0 .. max(clusters)``)
        holding the mean of the member columns. A cluster id that has no
        member yields a column of NaN.

    .. note::

        This function is obsolete now. Use `Pycluster.clustercentroids` instead.

    '''
    r, c = mat.shape

    total_clusters = int(np.max(clusters)) + 1

    # Accumulate the member columns of every cluster.
    cluster_means = np.zeros(shape=(r, total_clusters))
    for spndj in range(c):
        cluster_means[:, clusters[spndj]] += mat[:, spndj]

    # Number of members per cluster id.
    cluster_freq = np.bincount(clusters, minlength=total_clusters)

    # Sum / count for all clusters at once. An empty cluster is 0/0, which is
    # deliberately left as NaN; only the runtime warning is silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        cluster_means /= cluster_freq

    return cluster_means


def separateClusters(kcluster, labels=None):
    """
    Returns a list of lists containing separated clusters i.e. SPNDs
    that belong to the same cluster are put together in a list. As many
    lists as there are cluster ids (``0 .. max(kcluster)``).

    Parameters
    ----------
    kcluster : list
        Takes a 1-D cluster list representing {SPND <--> Cluster} mapping.
    labels : list, optional
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    cluster_result : list (of lists)
        ``cluster_result[k]`` holds the integer indexes of the SPNDs that
        belong to cluster ``k``. A cluster id without members gives an
        empty list; an empty ``kcluster`` gives ``[]``.

    Example
    -------
    >>> from kcTools import separateClusters
    >>> clusters = [ 2, 0, 1, 3, 2, 0, 1, 3,  2, 0, 1, 3]
    >>> print(separateClusters(clusters))
    [[1, 5, 9], [2, 6, 10], [0, 4, 8], [3, 7, 11]]

    """
    # len() instead of truthiness so that numpy arrays are accepted too.
    if len(kcluster) == 0:
        return []

    num_of_clusters = max(kcluster) + 1

    # One (initially empty) member list per cluster id.
    cluster_result = [[] for _ in range(num_of_clusters)]

    for spnd, cluster_id in enumerate(kcluster):
        cluster_result[cluster_id].append(spnd)

    return cluster_result


def prettyPrint(kcluster, labels=None):
    """
    Prints human readable format from cluster data.

    Parameters
    ----------
    kcluster : list
        cluster list representing SPNDs to Cluster mapping.
    labels : list, optional
        list (or array) containing string names of SPNDs. If nothing (or an
        empty sequence) is passed, automatic numbering is used.

    Returns
    -------
    None
        One line ``<cluster id>: <member labels>`` per cluster id is written
        to stdout. Nothing is printed for an empty ``kcluster``.

    Notes
    -----
    If the number of labels differs from the number of SPNDs a message is
    printed and the interpreter is terminated via ``SystemExit``.

    """
    # len() rather than truthiness: both arguments may be numpy arrays, for
    # which ``not arr`` is ambiguous and raises.
    if len(kcluster) == 0:
        return None

    num_of_clusters = max(kcluster) + 1

    if labels is None or len(labels) == 0:
        labels = [str(i) for i in range(len(kcluster))]

    if DEBUG:
        print("B: {} {}".format(len(labels), len(kcluster)))

    if len(kcluster) != len(labels):
        print("Mis-match in number of SPNDs and label count. Exiting!")
        # sys.exit instead of the interactive-only ``exit`` helper; non-zero
        # status so that calling scripts can see the failure.
        sys.exit(1)

    # One member list per cluster id, filled with the SPND labels.
    cluster_result = [[] for _ in range(num_of_clusters)]
    for idx, cluster_id in enumerate(kcluster):
        cluster_result[cluster_id].append(labels[idx])

    for index, clst in enumerate(cluster_result):
        print("{}: {}".format(index, clst))
    return None


def getInterClusterS(mat, clusters):
    '''
    Calculates the sum of inter-cluster distances.

    For every cluster the centroid (``Pycluster.clustercentroids``) is
    compared with the global mean signal using the distance
    ``1 - |Pearson correlation|``; that distance is weighted by the number
    of SPNDs in the cluster and the weighted distances are summed.

    Parameters
    ----------
    mat : numpy.ndarray
        SPND data used for clustering (records along rows, SPNDs along
        columns).
    clusters : list
        cluster map of SPND.

    Returns
    -------
    interClusterDistance : float
        Size-weighted sum of the distances of the cluster centroids from
        the global mean.

    '''
    Tn, SPNDn = mat.shape

    if DEBUG:
        print("{} records, {} sensors".format(Tn, SPNDn))

    # Mean over all SPNDs for every record. numpy.mean always divides in
    # floating point, unlike ``sum(row) / SPNDn`` on integer data.
    global_mean_vector = np.mean(mat, axis=1)

    # kcluster does not seem to produce empty clusters, so every id in
    # 0 .. max(clusters) is expected to have at least one member.
    total_clusters = np.max(clusters) + 1
    if DEBUG:
        print("{} clusters found".format(total_clusters))

    # Number of SPNDs per cluster, used as weight below.
    cluster_freq = np.bincount(clusters, minlength=total_clusters)

    # transpose=1: the columns (SPNDs) are the clustered items.
    cluster_means, _ = Pycluster.clustercentroids(mat, None, clusters, 'a', 1)

    cluster_distances = np.zeros(shape=(1, total_clusters))
    for i in range(total_clusters):
        corrMat = abs(np.corrcoef(cluster_means[:, i], global_mean_vector))
        if DEBUG:
            print("Correlation MAT 0:\r\n{}".format(corrMat))

        # Off-diagonal entry is the centroid/global-mean correlation.
        distance = 1 - corrMat[0, 1]

        # Scale the distance by the number of SPNDs in the cluster.
        cluster_distances[0, i] = distance * cluster_freq[i]

    return np.sum(cluster_distances)


def count_nan(arr):
    """
    Counts occurrences of numpy.nan in the passed data structure.

    Parameters
    ----------
    arr : numpy.ndarray, list
        The dataset in which nan(s) is to be counted.

    Returns
    -------
    count : int
        number of occurrences of nan within the data passed.

    Example
    -------
    >>> count_nan([1.0, float('nan'), 3.0])
    1

    """
    # The boolean NaN mask can be summed directly; no helper array needed.
    return np.isnan(arr).sum()


def custom_filter(mat):
    """
    Converts negative and NaN elements to 0 (zero) in the passed data.

    The data is first passed through ``numpy.nan_to_num`` (NaN becomes 0,
    infinities become the largest finite values of the dtype), then every
    negative element is replaced by 0. The input is not modified.

    Parameters
    ----------
    mat : numpy.ndarray
        matrix to be cleaned of -ve values

    Returns
    -------
    cleanMat : numpy.ndarray
        matrix with negative (and NaN) elements converted to zero

    """
    cleaned = np.nan_to_num(mat)

    # Select instead of multiplying by a 0/1 mask: ``negative * 0`` would
    # leave a negative zero (-0.0) behind.
    return np.where(cleaned < 0, 0, cleaned)


def removeFaultySensors(matSensor, labelSensor, minSensorOutput=0, allowedFault=25):
    """
    Eliminates those SPNDs which have x% bad sensor readings, x is provided by user.

    Parameters
    ----------
    matSensor : numpy.ndarray
        Data to be filtered (SPNDs along columns).
    labelSensor : list
        list of SPND labels, one per column of ``matSensor``.
    minSensorOutput : float
        `bad` value threshold. A sensor reading that is less than *or equal
        to* this value is considered a `bad` value.
    allowedFault : float
        % threshold for total bad values. If a SPND has these many (or
        more) % of `bad` readings then it is eliminated. The corresponding
        label is also removed from the list of labels.

    Returns
    -------
    matSensor : numpy.ndarray
        SPND data after removal of faulty SPND column(s).
    labelSensor
        Labels after removal of faulty SPND label(s). The container type of
        the input is kept: a list comes back as a list, a numpy array as a
        numpy array.

    """
    r, c = matSensor.shape
    total_samples = float(r)

    # Bad readings per sensor, counted for all columns at once.
    fault_counts = (matSensor <= minSensorOutput).sum(axis=0)
    fault_limit = (allowedFault * total_samples) / 100

    keep_mask = np.ones(c, dtype=bool)
    for i in range(c):
        if fault_counts[i] >= fault_limit:
            print("Faulty Channel:{} % fault is {}".format(
                labelSensor[i], (fault_counts[i] * 100) / total_samples))
            keep_mask[i] = False

    faulty_total = c - int(keep_mask.sum())
    if faulty_total:
        matSensor = matSensor[:, keep_mask]

        # The labels of the faulty SPNDs have to go as well.
        if isinstance(labelSensor, np.ndarray):
            labelSensor = labelSensor[keep_mask]
        else:
            labelSensor = [label for label, keep in zip(labelSensor, keep_mask)
                           if keep]
        print("{} faulty channels in Sensor data".format(faulty_total))
    else:
        print("No faulty channels in Sensor Data")

    return matSensor, labelSensor


def getOptimalCluster(sensor_mat, Si_threshold=0.1, Kmax=8, **kwargs):
    """
    Does clustering multiple times and tries to find the optimal cluster.

    Clustering is repeated for ``k = 2, 3, ..., Kmax - 1``. The search stops
    at the first ``k`` whose ratio ``Si = S_w / S_b`` (intra- over
    inter-cluster distance) is at or below ``Si_threshold``; otherwise the
    result of the last ``k`` tried is returned.

    Parameters
    ----------
    sensor_mat : numpy.ndarray
        SPND data matrix.
    Si_threshold : float
        Desired ratio of Intra to Inter Cluster distance. This value is a
        measure of `closeness` of the similar SPNDs in a cluster.
    Kmax : int
        Exclusive upper limit on the number of clusters; must be greater
        than 2.
    npass : int, optional (keyword only)
        Number of passes handed to ``Pycluster.kcluster``. Defaults to 100.

    Returns
    -------
    kcluster : list
        A list containing the cluster mapping of SPNDs. The index corresponds
        to SPND and value corresponds to Cluster.
        Eg ``kcluster = [2,0,1]`` signifies

            | :math:`0^{th} spnd :\\to cluster 2`
            | :math:`1^{st} spnd :\\to cluster 0`
            | :math:`2^{nd} spnd :\\to cluster 1`

    error : float
        Represents the sum of intra cluster distances.
    freq : int
        Represents how many times the optimal solution was found while clustering.

    Raises
    ------
    ValueError
        If ``Kmax`` is not greater than 2 (no clustering would be attempted).

    Example
    -------

        >>> from pylab import randn
        >>> import numpy
        >>> from kcTools import getOptimalCluster, prettyPrint
        >>> #Lets create a random matrix
        >>> a = numpy.array(randn(100)).reshape(10,10)
        >>> #perform clustering
        >>> kc, er, fc = getOptimalCluster(a,0.5,5)
        >>> prettyPrint(kc,['a','b','c','d','e','f','g','h','i','j'])
        0: ['d', 'h']
        1: ['a', 'g']
        2: ['b', 'f', 'i']
        3: ['c', 'e', 'j']

    """
    npass = kwargs.get('npass', 100)

    # Without at least one iteration there would be nothing to return.
    if Kmax <= 2:
        raise ValueError(
            "Kmax must be greater than 2 (k runs over range(2, Kmax)), got {}".format(Kmax))

    print("\r\n{} npass =  {} \r\n".format(datetime.datetime.now(), npass))

    for k in range(2, Kmax):
        kcluster, error, freq = Pycluster.kcluster(
            sensor_mat, k, None, None, 1, npass, 'a', 'a', None)
        S_w = error
        S_b = getInterClusterS(sensor_mat, kcluster)

        # S_b is zero when every centroid is perfectly correlated with the
        # global mean; treat the ratio as infinite instead of dividing by 0.
        if S_b != 0:
            S_i = S_w / S_b
        else:
            S_i = float('inf')

        print("for k:{}, Si:{}, Emin:{}".format(k, S_i, error))
        if S_i <= Si_threshold:
            break

    return kcluster, error, freq

def loadPcaData(**kwargs):  # wrapper
    """
    Returns relMatrix and residueCovMat (PCA model data) stored by
    :func:`savePcaData`.

    If no file name is given, the most recent ``pca-<timestamp>`` file in
    ``cpath`` (default: current directory) is located through
    :func:`getFreshCluster`.

    Parameters
    ----------
    fname : str, optional
        Name/path of the PCA file to load. Used as given.
    cpath : str, optional
        System path of the directory that is searched when ``fname`` is not
        passed.

    Returns
    -------
    relMatrix : ndarray
        PCA relations Matrix
    residueCovMat : ndarray
        Data-Covariance matrix

    ``None`` is returned instead when no stored PCA data could be found.

    Notes
    -----
    A file that does not contain both arrays is reported on stdout and the
    interpreter is terminated via ``SystemExit``.

    """
    if 'fname' in kwargs:
        fn = kwargs['fname']
    else:
        kwargs['pca'] = 1
        fn = getFreshCluster(**kwargs)
        # The search yields a bare file name located inside `cpath`; make it
        # loadable no matter what the current working directory is.
        if fn and not os.path.dirname(fn):
            fn = os.path.join(kwargs.get('cpath', ''), fn)

    if DEBUG and fn:
        print("Most recent file:{}".format(fn))

    if not fn:
        return None

    with np.load(fn) as data:
        if 'relMat' not in data.files or 'residueCovMat' not in data.files:
            print("Stored cluster data is not valid. [old cluster]")
            sys.exit(1)
        return data['relMat'], data['residueCovMat']


def savePcaData(relMat, residueCovMat, **kwargs):
    """
    Saves the PCA model data to disk.

    The two matrices are written as the named arrays ``relMat`` and
    ``residueCovMat`` of a NumPy ``.npz`` style archive.

    Parameters
    ----------
    relMat : ndarray
        PCA relations Matrix
    residueCovMat : ndarray
        Data-Covariance matrix
    fname : str, optional
        Name of the data file to create inside ``cpath``. Defaults to
        ``pca-<YYYY-mm-dd-HH-MM-SS>.np``.
    cpath : str, optional
        Existing directory in which the file is stored. Defaults to the
        current working directory.
    debug : bool, optional
        Verbose output. NOTE: this also (re)sets the module-wide ``DEBUG``
        flag; it becomes ``False`` when the argument is omitted.

    Returns
    -------
    name : str
        Full path of the written file when ``fname`` was given, otherwise
        the generated ``pca-<timestamp>.np`` file name.

    Notes
    -----
    For backward compatibility the process working directory is changed to
    ``cpath`` as a side effect. A non-existing ``cpath`` is reported on
    stdout and terminates the interpreter via ``SystemExit``.
    """
    global DEBUG
    DEBUG = kwargs.get('debug', False)

    if 'cpath' in kwargs:
        cpath = kwargs['cpath']
    else:
        cpath = os.getcwd()
        if DEBUG:
            print("WARNING: No path provided! \nCluster will be stored in current dir:\n {}".format(cpath))

    if not os.path.isdir(cpath):
        print("ERROR: {} : directory doesn't exist! [{}.py]".format(cpath, __name__))
        sys.exit(1)

    # Absolute, so the target stays valid after the chdir below.
    cpath = os.path.abspath(cpath)

    fname = kwargs.get('fname')
    if fname:
        fn = os.path.join(cpath, fname)
        target = fn
    else:
        if DEBUG:
            print("WARNING: No filename provided! Using timestamp as name.")
        fn = 'pca-{}.np'.format(strftime("%Y-%m-%d-%H-%M-%S"))
        target = os.path.join(cpath, fn)

    # Legacy side effect kept on purpose: callers may rely on the working
    # directory pointing at the storage directory after a save.
    os.chdir(cpath)

    # Binary mode is required: the archive is a zip file.
    with open(target, 'wb') as f:
        np.savez(f, relMat=relMat, residueCovMat=residueCovMat)

    return fn


def loadKCFromDisk(**kwargs):  # wrapper
    """
    Returns `clusterMap`, `CoVcombinedMat`, `spndLabels`, `spndMeans`, `spndVars` from disk (saved data).

    If no file name is given, the most recent ``cluster-<timestamp>`` file in
    ``cpath`` (default: current directory) is located through
    :func:`getFreshCluster`.

    Parameters
    ----------
    fname : str, optional
        Name/path of the cluster file to load. Used as given.
    cpath : str, optional
        System path of the directory that is searched when ``fname`` is not
        passed.

    Returns
    -------
    clusterMap : list
        Stored SPND to cluster mapping.
    CoVcombinedMat : numpy.ndarray
        Stored data matrix.
    spndLabels : list
        List of string names for SPNDs
    spndMeans : ndarray
        1-D numpy array containing Mean of each SPND sensor.
    spndVars : ndarray
        1-D numpy array containing Variance of each SPND sensor.

    A single ``None`` is returned instead when no stored cluster is found.

    Notes
    -----
    A file whose ``kc`` entry does not hold exactly five items is reported
    on stdout and the interpreter is terminated via ``SystemExit``.

    Example
    -------
    Here's a use case of this function ::

    >>> from kcTools import *
    >>> from pylab import randn
    >>> import numpy as np
    >>> mat = np.array(randn(1500)).reshape(150,10)
    >>> kc = np.array(range(10))
    >>> labels = ['a','b','c','d','e','f','g','h','i', 'j']
    >>> means = [np.mean(mat[:,i]) for i in range(10)]
    >>> vars = [np.var(mat[:,i]) for i in range(10)]
    >>> saveKCToDisk(kc, mat, labels, means, vars)
    >>> clusterMap, CoVcombinedMat, spndLabels, spndMeans, spndVars = loadKCFromDisk()

    .. note: order of arguments is to be tracked carefully.

    """
    if 'fname' in kwargs:
        fn = kwargs['fname']
    else:
        fn = getFreshCluster(**kwargs)
        # The search yields a bare file name located inside `cpath`; make it
        # loadable no matter what the current working directory is.
        if fn and not os.path.dirname(fn):
            fn = os.path.join(kwargs.get('cpath', ''), fn)

    if DEBUG and fn:
        print("Most recent file:{}".format(fn))

    if not fn:
        return None

    # The five pieces are stored as one object array, which NumPy can only
    # restore through pickle.
    # SECURITY NOTE: unpickling executes arbitrary code - only load cluster
    # files that come from a trusted source.
    with np.load(fn, allow_pickle=True) as data:
        res = data['kc']

    if len(res) != 5:
        print("Stored cluster data is not valid. [old cluster]")
        sys.exit(1)
    return res[0], res[1], res[2], res[3], res[4]


def saveKCToDisk(clusterMap, CoVcombinedMat, spndLabels, spndMeans, spndVars, **kwargs):
    """
    Saves a given cluster to disk.

    The five pieces are stored, in argument order, as a 5-element object
    array under the key ``kc`` of a NumPy ``.npz`` style archive.

    Parameters
    ----------
    clusterMap: list
        kcluster information to be saved to disk
    CoVcombinedMat: ndarray
        data used for clustering (mean centered and scaled)
    spndLabels: list
        String names for SPNDs
    spndMeans: ndarray
        1-D numpy array containing Mean of each SPND sensor.
    spndVars: ndarray
        1-D numpy array containing Variance of each SPND sensor.
    fname: str, optional
        Name of data file to be created inside ``cpath``. Defaults to
        ``cluster-<YYYY-mm-dd-HH-MM-SS>.np``.
    cpath : str, optional
        Existing directory where the cluster must be stored. Defaults to the
        current working directory.
    debug : bool, optional
        Verbose output. NOTE: this also (re)sets the module-wide ``DEBUG``
        flag; it becomes ``False`` when the argument is omitted.

    Returns
    -------
    name : str
        Full path of the written file when ``fname`` was given, otherwise
        the generated ``cluster-<timestamp>.np`` file name.

    Notes
    -----
    For backward compatibility the process working directory is changed to
    ``cpath`` as a side effect. A non-existing ``cpath`` is reported on
    stdout and terminates the interpreter via ``SystemExit``.
    """
    global DEBUG
    DEBUG = kwargs.get('debug', False)

    if 'cpath' in kwargs:
        cpath = kwargs['cpath']
    else:
        cpath = os.getcwd()
        if DEBUG:
            print("WARNING: No path provided! \nCluster will be stored in current dir:\n {}".format(cpath))

    if not os.path.isdir(cpath):
        print("ERROR: {} : directory doesn't exist! [{}.py]".format(cpath, __name__))
        sys.exit(1)

    # Absolute, so the target stays valid after the chdir below.
    cpath = os.path.abspath(cpath)

    fname = kwargs.get('fname')
    if fname:
        fn = os.path.join(cpath, fname)
        target = fn
    else:
        if DEBUG:
            print("WARNING: No filename provided! Using timestamp as name.")
        fn = 'cluster-{}.np'.format(strftime("%Y-%m-%d-%H-%M-%S"))
        target = os.path.join(cpath, fn)

    # Legacy side effect kept on purpose: callers may rely on the working
    # directory pointing at the storage directory after a save.
    os.chdir(cpath)

    # The pieces have different shapes, so they are packed element by element
    # into an explicit object array (a plain np.array() of a ragged list is
    # rejected by current NumPy releases).
    pieces = (clusterMap, CoVcombinedMat, spndLabels, spndMeans, spndVars)
    finalData = np.empty(len(pieces), dtype=object)
    for position, piece in enumerate(pieces):
        finalData[position] = piece

    # Binary mode is required: the archive is a zip file.
    with open(target, 'wb') as f:
        np.savez(f, kc=finalData)

    return fn


def getFreshCluster(**kwargs):
    """
    Searches a directory for the most recently time-stamped cluster file.

    Only files whose name starts with ``cluster-YYYY-mm-dd-HH-MM-SS`` (or
    ``pca-...`` when the ``pca`` keyword is present) are considered; the one
    with the latest time stamp in its name wins.

    Parameters
    ----------
    cpath : str, optional
        Directory to search. Defaults to ``"./"``.
    pca : any, optional
        If this keyword is present, PCA files are searched instead of
        cluster files.
    debug : bool, optional
        Verbose output. NOTE: this also (re)sets the module-wide ``DEBUG``
        flag; it becomes ``False`` when the argument is omitted.

    Returns
    -------
    name : str or None
        Bare file name (without directory) of the newest matching file, or
        ``None`` if the directory holds no matching file.

    Notes
    -----
    A non-existing ``cpath`` is reported on stdout and terminates the
    interpreter via ``SystemExit``.

    .. note:: No need to use this function directly, use :func:`~kClusterLib.kcTools.loadKCFromDisk`
        instead
    """
    global DEBUG
    DEBUG = kwargs.get('debug', False)

    if 'cpath' in kwargs:
        cpath = kwargs['cpath']
        if DEBUG:
            print(cpath)
    else:
        if DEBUG:
            print("`cpath` not passed. Default directory will be searched for clusters")
        cpath = "./"

    if not os.path.isdir(cpath):
        print("ERROR: {} : directory doesn't exist! [{}.py]".format(cpath, __name__))
        sys.exit(1)

    prefix = 'pca' if 'pca' in kwargs else 'cluster'
    # Group 1 captures the zero-padded time stamp of the file name.
    rex = re.compile(prefix + r'-(\d{4}(?:-\d{2}){5})')

    # (time stamp, file name) for every matching directory entry.
    candidates = []
    for name in os.listdir(cpath):
        found = rex.match(name)
        if found:
            candidates.append((found.group(1), name))

    if not candidates:
        if DEBUG:
            print("No cluster data found in path:\n{}".format(cpath))
        return None

    if DEBUG:
        print("{} previously stored cluster(s):".format(len(candidates)))
        for _, name in candidates:
            print(name)

    # The stamp is fixed-width and zero-padded, so plain string order equals
    # chronological order; max() keeps the first entry among equal stamps.
    newest = max(candidates, key=lambda item: item[0])
    return newest[1]


def get_datetime(fstr):
    """
    Extract the time stamp embedded in a stored file's name.

    Parameters
    ----------
    fstr : str
        File name (optionally with a directory part) that contains a
        ``YYYY-mm-dd-HH-MM-SS`` time stamp, e.g.
        ``cluster-2016-03-04-05-06-07.np``.

    Returns
    -------
    timestamp : datetime instance
        Valid datetime object. This allows us to check the freshness of
        files or to check how much time has elapsed since a particular
        file was created.

    Raises
    ------
    ValueError
        If the name holds no time stamp or the stamp is not a valid date.

    """
    # Only the file name is inspected so digits in directory names cannot be
    # mistaken for the stamp. Searching for the stamp (rather than slicing a
    # fixed-size suffix off) makes the extension and prefix irrelevant.
    stamp = re.search(
        r'(\d{4})-(\d{1,2})-(\d{1,2})-(\d{1,2})-(\d{1,2})-(\d{1,2})',
        os.path.basename(fstr))
    if stamp is None:
        raise ValueError(
            "no 'YYYY-mm-dd-HH-MM-SS' time stamp found in {!r}".format(fstr))

    return datetime.datetime(*[int(field) for field in stamp.groups()])


def halt():
    """
    Halts the execution of program until user chooses to proceed.

    The user is prompted on stdin until a valid answer is given: an empty
    line (just *enter*) resumes execution, a single ``q`` terminates the
    interpreter via ``SystemExit``.

    Parameters
    ----------
    None

    Returns
    -------
    None

    """
    # Line reader that works on Python 2 (raw_input) and Python 3 (input).
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    while True:
        inp = read_line("Press enter to continue or 'q' to exit: ")

        if not inp:
            print("Continuing...")
            break

        if inp == 'q':
            sys.exit()

        print("\r\n Please provide a valid input!\r\n")


def isSingletonCluster(cluster):
    '''Check if a singleton cluster exists in the passed 1-D SPND-cluster
    mapping array.

    Parameters
    ----------
    cluster : list
        1-D array of length( total number of SPNDs ) containing the SPND - cluster mapping.

    Returns
    -------
    status : bool
        Returns True if one or more cluster ids occur exactly once in
        ``cluster``; False otherwise (including for an empty mapping).


    Example
    -------

        >>> from kcTools import isSingletonCluster
        >>> clustr = [2,3,0,1,2,0,3,2,0]  # Here 1 is singleton
        >>> print (isSingletonCluster(clustr))  # printing
        >>> clustr = [2,3,0,2,2,0,3,2,0]  # No singleton
        >>> print (isSingletonCluster(clustr))  # printing

    Produces Following output

    .. testoutput::

        True
        False

    '''
    # An empty mapping has no clusters at all (and np.bincount cannot digest
    # the float array an empty list converts to).
    if len(cluster) == 0:
        return False

    # A cluster is a singleton when its id is used by exactly one SPND.
    return bool((np.bincount(cluster) == 1).any())


def mergeSingletonCluster(kcluster, mat, **kwargs):
    '''
    Merges one singleton cluster (if present) into its nearest neighbour
    cluster (distance ``S = 1 - abs(PC)``) in the 1-D SPND to cluster MAP.

    Only a single singleton is merged per call; when several exist, the one
    with the highest cluster id is handled. After the merge the cluster ids
    above the removed one are shifted down by one so that the ids stay
    contiguous.

    Parameters
    ----------
    kcluster: list
        List containing the 1-D SPND to cluster-id Mapping.
    mat: ndarray
        Matrix like structure containing the SPND sensor values.
    debug : bool, optional (keyword only)
        Print the distances and the result of the merge.

    Returns
    -------
    cluster: list
        Updated cluster mapping (a new list; ``kcluster`` is not modified).
        The mapping is returned unchanged when it holds no singleton or
        when there is no other cluster to merge into.

    '''
    clusters = list(kcluster)
    debug = kwargs.get('debug', False)

    if not clusters:
        return clusters

    # Members per cluster id; a singleton is an id used by exactly one SPND.
    counts = np.bincount(clusters)
    singleton_ids = np.flatnonzero(counts == 1)
    if len(singleton_ids) == 0:
        return clusters

    # cid is the id of the singleton cluster that gets merged.
    cid = int(singleton_ids[-1])

    # clusterList[k] = list of SPND indexes that belong to cluster k
    clusterList = [[] for _ in range(len(counts))]
    for spnd, label in enumerate(clusters):
        clusterList[label].append(spnd)

    singularSPND = clusterList[cid][0]

    print("singleton CLUSTER id: {}".format(cid))
    print("singleton SPND id: {}".format(singularSPND))

    # Distance of the singleton to every other cluster. The singleton itself
    # and empty cluster ids are excluded by an infinite distance.
    distances = []
    for label, members in enumerate(clusterList):
        if label == cid or not members:
            distances.append(np.inf)
        else:
            distances.append(Pycluster.clusterdistance(
                mat, None, None, members, clusterList[cid],
                transpose=1, dist='a', method='a'))

    if debug:
        print("Cluster Distances: {}".format(distances))

    neighbourNearest = int(np.argmin(distances))
    smallDist = distances[neighbourNearest]

    # No other populated cluster exists: nothing to merge into.
    if np.isinf(smallDist):
        return clusters

    if debug:
        print("Nearest Neighbour Distance {}".format(smallDist))
        print("Nearest Neighbour Cluster {}".format(neighbourNearest))

    # Merging = the lonely SPND joins the neighbour cluster ...
    clusters[singularSPND] = neighbourNearest

    # ... and the now unused id `cid` is closed by shifting higher ids down.
    clusters = [label - 1 if label > cid else label for label in clusters]

    if debug:
        print("Merge Complete. Singleton cluster {} had SPND {}, New cluster Id for SPND {} is {}".format(
            cid, singularSPND, singularSPND, neighbourNearest))
        print("Old cluster map was : {}".format(kcluster))
        print("New cluster map is  : {}".format(clusters))
    return clusters