Source code for kClusterLib.kcTools

# -*- coding: utf-8 -*-
""" Some klustering related modules

This modeule is a collection of many useful functions.

.. contents:: 
    :depth: 3

"""



import numpy as np
import re
import os
import sys
import time
from time import strftime
import datetime

import Pycluster

DEBUG = False#True

[docs]def centerVectors(sensor_vectors ): """ Applies Mean centering (and scaling too..) to SPND data. :math:`\\tilde{s_{ij}} = \\frac{ s_{ij} - \\bar{s} }{ \sqrt {\sigma}}` Parameters ---------- sensor_vectors : numpy.ndarray SPND data matrix. SPNDs along columns. Returns ------- centered_array : numpy.ndarray Matrix(SPNDs along column) with elements centered and scaled. mean_vector : list A row vector containing the sensor means. sqrt_var_vector: list A row vector containing the :math:`\\sqrt{\\sigma^2}` for each sensor. """ t1 = time.time() # zero elements count # zc = (sensor_vectors == 0).sum() # lets CLEAN data # sensor_vectors= custom_filter(sensor_vectors) # if DEBUG: # print("{} enteries were cleaned".format((sensor_vectors == 0).sum() - zc)) # dimensions of RAW matrix r, c = sensor_vectors.shape # Container for result. COPY of original data centered_array = np.array(sensor_vectors,copy=True) # Create a 1xN vector to hold mean of each SPND ch manual_mean_vector = [0 for i in range(c) ] # 1-D vectors are to be stored inside list instead of ndarray # Create a 1xN vector to hold var of each SPND ch manual_var_vector = [0 for i in range(c) ]# 1-D vectors are to be stored inside list instead of ndarray # Now lets calculate ( and display) follows: # "numpy_mean" # manually calculated mean # "numpy variance", # manually calculated variance ##*********************************** # In loop below we take a single SPND ch and then # perform computation( mean and variance) on them # # NOTE: Original data matrix is not modified. We are working on a copy! # (?) What about the faulty values [-ve, 0, nan] (?) # Filtering will take care of them ##*********************************** # Initial 3 columns are SERIAL, DATE, TIME # SPND data strats from 4th column index = 0 for column in xrange(c): # Lets extract column vectors and center them around mean # Select entire column sliced_vector = sensor_vectors[:,index] # manual_mean = np.sum(sliced_vector)/float(r) # print ("manual mean is: {}".format(manual_mean)) auto_mean = np.mean( sliced_vector) if DEBUG: print (" np mean is: {}".format(auto_mean)) # if DEBUG: # print ("Processing SPND_Ch:{}, mean_E_delta:{}".format(column, \ # manual_mean - auto_mean)) #if (condition): manual_mean_vector[ index] = auto_mean ## # var = E( A_i - A_mean )^2 # ------------------- # N ## # manual_var = np.sum( \ # np.square( \ # sliced_vector - manual_mean ) \ # )/float(r) # print manual_var auto_var = np.var( sliced_vector,dtype=np.float64) # if DEBUG: # print (" var_E_delta:{}".format(manual_var - auto_var)) manual_var_vector[ index] = auto_var index = index + 1 # move to next vector channel #Lets subtract means and divide by variance index = 0 for col in xrange(c): centered_array[:,index] = centered_array[:,index] - manual_mean_vector[index] if DEBUG: print ("Mean centering residue for col:{} is {}".format( col, np.mean(centered_array[:,index]))) divisor = np.sqrt(manual_var_vector[index]) ####### WHAT to DO if denominator i.e. SD is zero (?) if count_nan(centered_array[:,index]) or (divisor==0) or np.isnan( divisor ): centered_array[:,index] = 0 print("\r\nUnexpected Halt. \r\nEncountered zero variance while scaling ") halt() else: centered_array[:,index] = centered_array[:,index] / divisor if count_nan(centered_array[:,index]) > 0: #centered_array[:,index] = 0 print centered_array[:,index] print np.sqrt(manual_var_vector[index]) print ("{} {} nan's found".format(col,count_nan(centered_array[:,index]))) centered_array[:,index] = 0 index = index + 1 # move to next vector channel # if DEBUG: # print ("quick check for mean centering error:{}".format(np.mean(centered_array))) ##*********************************************************** # k-means Clustering # # Note: Passing seeds is not allowed ( checked from documentation) # # Passing previous clustering information is allowed # ##*********************************************************** #print sensor_vectors # Achieved vectorization #sensor_mat = np.fromiter(con.fetchall(), count=res_count-1, dtype=[('',np.float32)]*45) #print column_names #print("mat:{}, amean:{}, mmean:{}, avar:{}, mvar:{}".format(id(sensor_vectors),id(),id(),id())) if DEBUG: print("::: Module executed in {} seconds, status 0 :::".format(time.time() - t1)) return centered_array, manual_mean_vector, manual_var_vector #, time.time() - t1
[docs]def getCentroids(mat, clusters): ''' Gives Centroids of clusters. Requires data matrix and cluster map. .. note:: This function is obsolete now. Use `Pycluster.clustercentroids` instead. ''' r,c = mat.shape total_clusters = np.max(clusters) + 1 cluster_means = np.zeros(shape=(r, total_clusters)) for spndj in range(c): clusterId = clusters[spndj] # element tells us cluster-id cluster_means[:,clusterId] += mat[:, spndj] ## lets check the frequency of clusters cluster_freq = np.bincount(clusters,minlength=(total_clusters)) ## We need to divide the sum by frequency for i in xrange(total_clusters): cluster_means[:,i] = cluster_means[:,i]/cluster_freq[i] return cluster_means
[docs]def separateClusters(kcluster, labels=[]): """ Returns a list of lists containing separated clusters i.e. SPND's that belong to same cluster are put together in a list. As many list's as there are clusters. Parameters ---------- kcluster : list Takes a 1-D cluster list representing {SPND <--> Cluster} mapping. Returns ------- cluster_result : list (of lists) A list containing members list which contain the integer indexes of SPNDs belonging to a cluster Example ------- >>> from kcTools import separateClusters >>> clusters = [ 2, 0, 1, 3, 2, 0, 1, 3, 2, 0, 1, 3] >>> print separateClusters(clusters) >>> ... [[1, 5, 9], [2, 6, 10], [0, 4, 8], [3, 7, 11]] """ num_of_clusters = max(kcluster) + 1 cluster_result = [] # to hold our data # Create a container for index in range( num_of_clusters ): cluster_result.append([]) # Fill the container for SPND in range( len( kcluster )): cluster_result[kcluster[SPND]].append( SPND ) # index = 0 # for clst in cluster_result: # print ("{}: {}".format(index,clst)) # index += 1 # return None return cluster_result
[docs]def prettyPrint(kcluster, labels=[]): """ Prints human readable format from cluster data. Parameters ---------- kcluster : list cluster list representing SPNDs to Cluster mapping. labels : list, optional list containing string names of SPNDs. If no list is passed automatic numbering is used. Returns ------- Displays output on stdout. Example ------- """ num_of_clusters = max(kcluster) + 1 if not labels: labels = [str(i) for i in range(len(kcluster))] if DEBUG: print "B:", len(labels),len(kcluster) if not (len( kcluster ) == len( labels )): print("Mis-match in number of SPNDs and label count. Exiting!") exit() cluster_result = [] # to hold our data for index in range( num_of_clusters ): cluster_result.append([]) for idx in range( len( kcluster )): cluster_result[kcluster[idx]].append( labels[idx] ) index = 0 for clst in cluster_result: print ("{}: {}".format(index,clst)) index += 1 return None
[docs]def getInterClusterS(mat, clusters): ''' Calculates the sum of intercluster distances. Parameters ---------- mat : numpy.ndarrray SPND data used for clustering. clusters : list cluster map of SPND. Returns ------- interClusterDistance : float Sum of cluster means from global mean. Example ------- ''' Tn, SPNDn = mat.shape if DEBUG: print("{} records, {} sensors".format(Tn, SPNDn)) # Get individual channel means global_mean_vector = np.zeros(shape=(Tn,1)) for Ti in range(Tn): global_mean_vector[Ti,0]=sum(mat[Ti,:])/SPNDn # Get cluster means !!! Q: What If a cluster is empty? # A: It seems the kcluster function doesn't # produce an empty cluster. total_clusters = np.max(clusters) + 1 if DEBUG: print ("{} clusters found".format(total_clusters)) # cluster_means = np.zeros(shape=(Tn,total_clusters)) #list_cluster = list(clusters) #print("clusters: {}".format(clusters)) #print("clusters_list: {}".format(list_cluster)) ##********************************************** # Cluster mean calculation # ------------------------ # We already have the means of each spnd in means_vector # Now we will combine those spnd means which belong to same cluster # ##********************************************** # index = 0 # index is basically the current spnd_number # for element in clusters: # element tells us cluster number # cluster_means[:,element] += mat[:, index] # index += 1 r,c = mat.shape # for spndj in range(c): # clusterId = clusters[spndj] # element tells us cluster-id # cluster_means[:,clusterId] += mat[:, spndj] # ## lets check the frequency of clusters cluster_freq = np.bincount(clusters,minlength=(total_clusters)) # if DEBUG: # print cluster_freq # print np.sum(cluster_freq) # should be equal to spnd count # ## We need to divide the summed # for i in xrange(total_clusters): # cluster_means[:,i] = cluster_means[:,i]/cluster_freq[i] # if DEBUG: # print("Cluster centroids:{}".format( cluster_means )) #cluster_means = getCentroids(mat, clusters) # Manual Method cluster_means, mm = Pycluster.clustercentroids( mat, None, clusters, 'a',1) # Now we have individual cluster means # We can calculate the cluster distances from global centroid cluster_distances = np.zeros(shape=(1,total_clusters)) for i in xrange(total_clusters): #cluster_distances[0,i] = 1-abs(np.corrcoef(cluster_means[:,i],global_mean_vector)) #print "debug step" #print cluster_means[:,i].shape #print global_mean_vector[:,0].shape corrMat = abs(np.corrcoef(cluster_means[:,i],global_mean_vector[:,0])) if DEBUG: print("Correlation MAT 0:\r\n{}".format(corrMat)) corrMat = 1 - corrMat #print("Correlation MAT 1:\r\n{}".format(corrMat)) cluster_distances[0,i] = corrMat[0,1] # scale the distances by number of SPND's in each cluster for i in xrange(total_clusters): cluster_distances[0,i] = cluster_distances[0,i] * cluster_freq[i] #exit() #print np.sum(cluster_distances) return np.sum(cluster_distances)
[docs]def count_nan(arr): """ Counts occurances of numpy.nan in the passed data structure. Parameters ---------- arr : numpy.ndarray, list The dataset in which nan(s) is to be counted. Returns ------- count : int number of occurances of nan within the data passed. Example ------- """ mask = np.ones_like(arr) mask[np.isnan(arr)]=0 return (mask==0).sum()
[docs]def custom_filter(mat): """ Converts negative elements to 0 (zero) in the passed data. Parameters ---------- mat : numpy.ndarray matrix to be cleaned of -ve values Returns ------- cleanMat : numpy.ndarray matrix with negative elements converted to zero """ mask = np.ones_like(mat) mat = np.nan_to_num(mat) mask[ mat < 0 ]=0 # if DEBUG: # print mask return np.multiply(mat,mask) #element wise MULTIPLY
[docs]def removeFaultySensors(matSensor, labelSensor, minSensorOutput=0, allowedFault=25): """ Eliminates those SPNDs which have x%% bad sensor readings, x is provided by user. Parameters ---------- matSensor : numpy.ndarray Data to be filtered. labelSensor : list list of SPND labels. minSensorOutput : float sensor faulty value(output) or `bad` value threshold. If a sensor value is below this value then it is considered a `bad` value. allowedFault : float %% threshold for total bad values. If a SPND has these many (or more) %% of `bad` readings then it is eliminated. Corresponding label is also removed from the list of labels Returns ------- matSensor : numpy.ndarray SPND data after removval of faulty SPND column(s). labelSensor label list after removal of faulty SPND label(s). """ r,c = matSensor.shape total_samples = float(r) faultCount=0 maskFaultOne = [1 for i in range(c)] for i in range(c): #For each Sensor>>> faultCount=0 sliceArr = matSensor[:,i] faultCount = (sliceArr <= minSensorOutput).sum() #How many faults if faultCount >= (allowedFault*total_samples)/100: print("Faulty Channel:{} %% fault is {}".format(labelSensor[i],(faultCount*100)/total_samples)) maskFaultOne[i]=0 # Marked the Sensor on mask if (np.array(maskFaultOne) == 0).sum(): # If mask indicates faulty sensors boolFMask = np.array(maskFaultOne, dtype=bool) rr,cc = matSensor.shape matSensor = matSensor[:,boolFMask] #Cleaning rrr,ccc = matSensor.shape ## We also need to delete the labels corresponding to faulty SPNDs lbl = np.array(labelSensor) labelSensor = lbl[boolFMask] print("{} faulty channels in Sensor data".format(cc-ccc)) else: print "No faulty channels in Sensor Data" return matSensor, labelSensor
[docs]def getOptimalCluster(sensor_mat, Si_threshold=0.1, Kmax=8, **kwargs): """ Does clustering multiple times and tries to find the optimal cluster. Parameters ---------- sensor_mat : numpy.ndarray SPND data matrix. Si_threshold : int Desired ratio of Intra to Inter Cluster distance. This value is a measure of `closeness` of the similar SPNDs in a cluster. Kmax : int Upper limit on number of clusters. The clustering program starts clustering from k=2 (two clusters) and then keeps increases k to find the smaller Si. Returns ------- kcluster : list A list containing the cluster mapping of SPNDs. The index corresponds to SPND and value corresponds to Cluster. Eg ``kcluster = [2,0,1]`` signifies | :math:`0^{th} spnd :\\to cluster 2` | :math:`1^{st} spnd :\\to cluster 0` | :math:`2^{nd} spnd :\\to cluster 1` error : float Represents the sum of intra cluster distances. freq : int Represents how many times the optimal solution was found while clustering. Example ------- >>> from pylab import randn >>> import numpy >>> from kcTools import getoptimalCluster, prettyPrint >>> #Lets create a random matrix >>> a = numpy.array(randn(100)).reshape(10,10) >>> #perform clustering >>> kc, er, fc = getoptimalCluster(a,0.5,5) >>> prettyPrint(kc,['a','b','c','d','e','f','g','h','i','j']) 0: ['d', 'h'] 1: ['a', 'g'] 2: ['b', 'f', 'i'] 3: ['c', 'e', 'j'] ... """ if 'npass' in kwargs: npass = kwargs['npass'] else: npass = 100 S_w = 0 S_b = 0 print "\r\n",datetime.datetime.now(),"npass = ",npass,"\r\n" for k in range(2,Kmax): kcluster, error, freq = Pycluster.kcluster(sensor_mat,k,None,None,1,npass,'a','a',None) S_w = error S_b = getInterClusterS(sensor_mat, kcluster) ### Can S_b be zero ? S_i = (S_w / S_b) print("for k:{}, Si:{}, Emin:{}".format(k,S_i,error)) if S_i <= Si_threshold: break return kcluster, error, freq
[docs]def loadPcaData(**kwargs): # wrapper """ Returns relMatrix and residueCovMat. PCA model data. This function searches for the stored data in current directory or some other directory (based on `cpath` arguments) Parameters ---------- fname : str, optional String Name of cluster to load from current dir cpath : str, optional System path of directory where cluster is stored. Returns ------- relMatrix: ndarray PCA relations Matrix residueCovMat : ndarray Data-Covariance matrix .. note: order of arguments is to be tracked carefully. """ global DEBUG if 'fname' in kwargs: fn = kwargs['fname'] else: kwargs['pca']=1 fn = getFreshCluster(**kwargs) if DEBUG and fn: print ("Most recent file:{}".format(fn)) if not fn: return None with np.load(fn) as data: res = data['kc'] # print res if not (len(res) == 5): print "Stored cluster data is not valid. [old cluster]" exit() return res[0], res[1]#, res[2], res[3], res[4]
[docs]def savePcaData( relMat, residueCovMat, **kwargs): """ Saves a given cluster to memory. Parameters ---------- relMatrix: ndarray PCA relations Matrix residueCovMat : ndarray Data-Covariance matrix Returns ------- name : str Returns ``<filename>.np`` if successfully written the data to disk/dir. ... """ #saves cluster to memory global DEBUG if 'debug' in kwargs: DEBUG = kwargs['debug'] else: DEBUG = False if 'cpath' in kwargs: cpath = kwargs['cpath'] else: cpath = os.getcwd() if DEBUG: print("WARNING: No path provided! \nCluster will be stored in current dir:\n {}".format(cpath)) status = os.path.isdir(cpath) if not status: print ("ERROR: {} : directory doesn't exist! [{}.py]".format(cpath, __name__)) exit() if 'fname' in kwargs: fname = kwargs['fname'] else: fname=None if fname: fn = cpath + fname else: if DEBUG: print("WARNING: No filename provided! Using timestamp as name.") n = strftime("%Y-%m-%d-%H-%M-%S") fn = 'pca-{}.np'.format(n) os.chdir(cpath) #here we add three data pieces to a single array temp = [clusterMap, CoVcombinedMat, spndLabels, spndMeans, spndVars] finalData = np.array(temp) with open( fn ,'w+') as f: np.savez(f, kc=finalData) return fn
[docs]def loadKCFromDisk(**kwargs): # wrapper """ Returns `clusterMap`, `CoVcombinedMat`, `spndLabels`, `spndMeans`, `spndVars` from disk (saved data). This function searches for the stored data in current directory or some other directory (based on `cpath` arguments) Parameters ---------- fname : str, optional String Name of cluster to load from current dir cpath : str, optional System path of directory where cluster is stored. Returns ------- clusterMap : list If a cluster is located in the given path( or current directory) else ``None`` is returned. CoVcombinedMat : numpy.ndarray matrix stored in memory spndLabels: list List of string names for SPNDs spndMeans: ndarray 1-D numpy array containing Mean of each SPND sensor. spndVars: ndarray 1-D numpy array containing Mean of each SPND sensor. Example ------- Here's a use case of this function :: >>> from kcTools import * >>> from pylab import randn >>> import numpy as np >>> mat = np.array(randn(1500)).reshape(150,10) >>> kc = np.array(range(6)) >>> labels = ['a','b','c','d','e','f','g','h','i', 'j'] >>> means = [np.mean(mat[:,i] for i in range(10))] >>> vars = [np.var(mat[:,i] for i in range(10))] >>> saveKCToDisk(kc, mat, labels, means, vars) >>> clusterMap, CoVcombinedMat, spndLabels, spndMeans, spndVars = loadKCFromDisk() >>> print( "kc: {}\\r\\n\\r\\nmat:{} \\r\\n\\r\\nlbl:{} \\r\\n\\r\\nmeans:{} \\r\\n\\r\\nvars:{}".format(clusterMap, CoVcombinedMat, spndLabels, spndMeans, spndVars) ) .. note: order of arguments is to be tracked carefully. """ global DEBUG if 'fname' in kwargs: fn = kwargs['fname'] else: fn = getFreshCluster(**kwargs) if DEBUG and fn: print ("Most recent file:{}".format(fn)) if not fn: return None with np.load(fn) as data: res = data['kc'] # print res if not (len(res) == 5): print "Stored cluster data is not valid. [old cluster]" exit() return res[0], res[1], res[2], res[3], res[4]
[docs]def saveKCToDisk( clusterMap, CoVcombinedMat, spndLabels, spndMeans, spndVars, **kwargs): """ Saves a given cluster to memory. Parameters ---------- clusterMap: list kcluster information to be saved to disk CoVcombinedMat: ndarray data used for clustering (mean centered and scaled) spndLabels: list String names for SPNDs spndMeans: ndarray 1-D numpy array containing Mean of each SPND sensor. spndVars: ndarray 1-D numpy array containing Mean of each SPND sensor. fname: str, optional Name of data file to be created and stored on dir. cpath : str, optional Valid system path where the cluster must be stored. Returns ------- name : str Returns ``<filename>.np`` if successfully written the data to disk/dir. ... """ #saves cluster to memory global DEBUG if 'debug' in kwargs: DEBUG = kwargs['debug'] else: DEBUG = False if 'cpath' in kwargs: cpath = kwargs['cpath'] else: cpath = os.getcwd() if DEBUG: print("WARNING: No path provided! \nCluster will be stored in current dir:\n {}".format(cpath)) status = os.path.isdir(cpath) if not status: print ("ERROR: {} : directory doesn't exist! [{}.py]".format(cpath, __name__)) exit() if 'fname' in kwargs: fname = kwargs['fname'] else: fname=None if fname: fn = cpath + fname else: if DEBUG: print("WARNING: No filename provided! Using timestamp as name.") n = strftime("%Y-%m-%d-%H-%M-%S") fn = 'cluster-{}.np'.format(n) os.chdir(cpath) #here we add three data pieces to a single array temp = [clusterMap, CoVcombinedMat, spndLabels, spndMeans, spndVars] finalData = np.array(temp) with open( fn ,'w+') as f: np.savez(f, kc=finalData) return fn
[docs]def getFreshCluster(**kwargs): """ Searches for a valid cluster on disk. .. note:: No need to use this function directly, use :func:`~kClusterLib.kcTools.loadKCFromDisk` instead """ global DEBUG if 'debug' in kwargs: DEBUG = kwargs['debug'] else: DEBUG = False if 'cpath' in kwargs: cpath = kwargs['cpath'] print cpath else: if DEBUG: print ("`cpath` not passed. Default directory will be searched for clusters") cpath = "./" status = os.path.isdir(cpath) if not status: print ("ERROR: {} : directory doesn't exist! [{}.py]".format(cpath, __name__)) exit() ## list_fn is the list of filenames fl = os.listdir(cpath) if 'pca' in kwargs: rex = re.compile('pca-\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2}') else: rex = re.compile('cluster-\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2}') fl = filter(lambda x: rex.match(x), fl) fl_bkp = fl if not fl: if DEBUG: print ("No cluster data found in path:\n{}".format(cpath)) return None if DEBUG: print ("{} previously stored cluster(s):".format(len(fl))) for i in range(len(fl)): print fl[i] if len(fl) == 1 : # If there is only one clusterAvailable return fl[0] # we return the only cluster we found #Lets initialize our variable from first file's name ntime = get_datetime(fl[0]) cursor = 0 for i in range(1,len(fl)): if DEBUG: print("") ctime = get_datetime(fl[i]) if ctime > ntime: ntime = ctime cursor = i return fl_bkp[cursor]
[docs]def get_datetime( fstr ): """ Pass a valid DATE-Time string to this function and get a valid ``datetime`` ``object`` Parameters ---------- fstr : str A valid datetime string Returns ------- timestamp : datetime instance Valid datetime object. This allows us to check the freshness of files or checking how mush time has elapsed since a perticular file was created or """ tokens = fstr[:-3].split('-') tok = [int(ss) for ss in tokens[1:]] return datetime.datetime(tok[0],tok[1],tok[2],tok[3],tok[4],tok[5])
[docs]def halt(): """ Halts the execution of program until user chooses to proceeed Parameters ---------- None Returns ------- None """ while True: inp = raw_input("Press enter to continue or 'q' to exit: ") if not inp: print "Continuing..." break if len(inp)==1: if 'q' in inp: exit() break print("\r\n Please provide a valid input!\r\n")
[docs]def isSingletonCluster(cluster): '''Check if a singleton cluster exists in the passed 1-D SPND-cluster mapping array. Parameters ---------- cluster : list 1-D array of length( total number of SPNDs ) containing the SPND - cluster mapping. Returns ------- status : bool Returns True if one or more singleton cluster labels occur in cluster. Example ------- >>> from kcTools import isSingletonCluster >>> clustr = [2,3,0,1,2,0,3,2,0] # Here 1 is singleton >>> print (isSingletonCluster(clustr)) # printing >>> clustr = [2,3,0,2,2,0,3,2,0] # No singleton >>> print (isSingletonCluster(clustr)) # printing Produces Following output .. testoutput:: True False ''' cluster_freq = np.bincount(cluster,minlength=(len(cluster)+1)) if 1 in cluster_freq: return True # No single ton return False
[docs]def mergeSingletonCluster( kcluster, mat, **kwargs): ''' Merges the singleton cluster(if present) to nearest neighbour(S=1-abs(PC)) \ in the 1-D SPND to cluster MAP. Parameters ---------- kcluster: list List containing the 1-D SPND to cluster-id Mapping. mat: ndarray Matrix like structure containing the SPND sensor values. Returns ------- cluster: list Updated cluster mapping Example ------- ''' clusters = list(kcluster) DEBUG = False if 'debug' in kwargs: DEBUG = kwargs['debug'] #check 1 if not isSingletonCluster(clusters): return clusters #cid is the id of SPND present in singleton cluster cid = np.nan # Initially we assign it an invalid value # We have a compact list that maps `SPNDs to clusterID`. # changing the above mapping to simple form # cluster = list containing SPND_IDs of member SPNDs clusterList = separateClusters(clusters) # Finding out the SPND_ID of singleton SPND for c in clusterList: if len(c)==1: cid = clusterList.index(c) # In previous step did we obtain the singleton SPND ID(?) if np.isnan(cid): print "No singleton Found! Exiting" return clusters print "singleton CLUSTER id:",cid print "singleton SPND id:",clusterList[cid][-1] # print "List of Clusters",clusterList distances = [] for c in range(len(clusterList)): if c == cid: distances.append(np.inf) # Intentionally making it high else: distance = Pycluster.clusterdistance(mat, None, None, clusterList[c], clusterList[cid], transpose=1, dist='a', method='a') #print " : ",distance distances.append(distance) if DEBUG: print "Cluster Distances:",distances smallDist = np.min(distances) neighbourNearest = distances.index(smallDist) if DEBUG: print "Nearest Neighbour Distance", smallDist print "Nearest Neighbour Cluster", neighbourNearest # Merging means we insert the lonely SPND to neighbour cluster (which is a list of SPNDS) singularSPND = clusters.index(cid) clusters[singularSPND] = neighbourNearest ## Now we need to take care of cid ## for j in range(cid+1,max(kcluster)+1): for i in range(len(clusters)): if clusters[i] == j: clusters[i] = j-1 ## That was looooong if DEBUG: print "Merge Complete. Singleton cluster {} had SPND {}, New cluster Id for SPND {} is {}".format(cid,clusterList[cid][-1],clusterList[cid][-1],neighbourNearest) print "Old cluster map was : {}".format(kcluster) print "New cluster map is : {}".format(clusters) return clusters