# -*- coding: utf-8 -*-
""" Some klustering related modules
This modeule is a collection of many useful functions.
.. contents::
:depth: 3
"""
import numpy as np
import re
import os
import sys
import time
from time import strftime
import datetime
import Pycluster
DEBUG = False  # set to True for verbose diagnostics
def centerVectors(sensor_vectors):
"""
    Applies mean centering and scaling to SPND data.
    :math:`\\tilde{s}_{ij} = \\frac{ s_{ij} - \\bar{s}_j }{ \\sqrt{ \\sigma_j^2 } }`
Parameters
----------
sensor_vectors : numpy.ndarray
SPND data matrix. SPNDs along columns.
Returns
-------
centered_array : numpy.ndarray
Matrix(SPNDs along column) with elements centered and scaled.
mean_vector : list
A row vector containing the sensor means.
sqrt_var_vector: list
A row vector containing the :math:`\\sqrt{\\sigma^2}` for each sensor.
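    Example
    -------
    A short worked example (values rounded to three decimals):

    >>> import numpy as np
    >>> from kcTools import centerVectors
    >>> mat = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
    >>> centered, means, variances = centerVectors(mat)
    >>> [float(m) for m in means]
    [2.0, 20.0]
    >>> np.round(centered, 3).tolist()
    [[-1.225, -1.225], [0.0, 0.0], [1.225, 1.225]]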
"""
t1 = time.time()
# zero elements count
# zc = (sensor_vectors == 0).sum()
# lets CLEAN data
# sensor_vectors= custom_filter(sensor_vectors)
# if DEBUG:
# print("{} enteries were cleaned".format((sensor_vectors == 0).sum() - zc))
# dimensions of RAW matrix
r, c = sensor_vectors.shape
# Container for result. COPY of original data
centered_array = np.array(sensor_vectors,copy=True)
# Create a 1xN vector to hold mean of each SPND ch
manual_mean_vector = [0 for i in range(c) ] # 1-D vectors are to be stored inside list instead of ndarray
# Create a 1xN vector to hold var of each SPND ch
manual_var_vector = [0 for i in range(c) ]# 1-D vectors are to be stored inside list instead of ndarray
# Now lets calculate ( and display) follows:
# "numpy_mean"
# manually calculated mean
# "numpy variance",
# manually calculated variance
##***********************************
# In loop below we take a single SPND ch and then
# perform computation( mean and variance) on them
#
# NOTE: Original data matrix is not modified. We are working on a copy!
# (?) What about the faulty values [-ve, 0, nan] (?)
# Filtering will take care of them
##***********************************
# Initial 3 columns are SERIAL, DATE, TIME
    # SPND data starts from the 4th column
index = 0
for column in xrange(c):
# Lets extract column vectors and center them around mean
# Select entire column
sliced_vector = sensor_vectors[:,index]
# manual_mean = np.sum(sliced_vector)/float(r)
# print ("manual mean is: {}".format(manual_mean))
auto_mean = np.mean( sliced_vector)
if DEBUG:
print (" np mean is: {}".format(auto_mean))
# if DEBUG:
# print ("Processing SPND_Ch:{}, mean_E_delta:{}".format(column, \
# manual_mean - auto_mean))
#if (condition):
manual_mean_vector[ index] = auto_mean
##
# var = E( A_i - A_mean )^2
# -------------------
# N
##
# manual_var = np.sum( \
# np.square( \
# sliced_vector - manual_mean ) \
# )/float(r)
# print manual_var
auto_var = np.var( sliced_vector,dtype=np.float64)
# if DEBUG:
# print (" var_E_delta:{}".format(manual_var - auto_var))
manual_var_vector[ index] = auto_var
index = index + 1 # move to next vector channel
#Lets subtract means and divide by variance
index = 0
for col in xrange(c):
centered_array[:,index] = centered_array[:,index] - manual_mean_vector[index]
if DEBUG:
print ("Mean centering residue for col:{} is {}".format( col, np.mean(centered_array[:,index])))
divisor = np.sqrt(manual_var_vector[index])
####### WHAT to DO if denominator i.e. SD is zero (?)
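        # A zero (or NaN) standard deviation would make the scaled column undefined,
        # so that column is zeroed out and execution is paused for the user to inspect.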
if count_nan(centered_array[:,index]) or (divisor==0) or np.isnan( divisor ):
centered_array[:,index] = 0
print("\r\nUnexpected Halt. \r\nEncountered zero variance while scaling ")
halt()
else:
centered_array[:,index] = centered_array[:,index] / divisor
if count_nan(centered_array[:,index]) > 0:
#centered_array[:,index] = 0
print centered_array[:,index]
print np.sqrt(manual_var_vector[index])
print ("{} {} nan's found".format(col,count_nan(centered_array[:,index])))
centered_array[:,index] = 0
index = index + 1 # move to next vector channel
# if DEBUG:
# print ("quick check for mean centering error:{}".format(np.mean(centered_array)))
##***********************************************************
# k-means Clustering
#
# Note: Passing seeds is not allowed ( checked from documentation)
#
# Passing previous clustering information is allowed
#
##***********************************************************
if DEBUG:
print("::: Module executed in {} seconds, status 0 :::".format(time.time() - t1))
return centered_array, manual_mean_vector, manual_var_vector #, time.time() - t1
def getCentroids(mat, clusters):
'''
Gives Centroids of clusters. Requires data matrix and cluster map.
.. note::
This function is obsolete now. Use `Pycluster.clustercentroids` instead.
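    A minimal sketch of the replacement call (keyword names as per the
    Pycluster documentation; ``transpose=1`` because SPNDs are along columns):

    >>> import Pycluster
    >>> centroids, cmask = Pycluster.clustercentroids(mat, clusterid=clusters, transpose=1)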
'''
r,c = mat.shape
total_clusters = np.max(clusters) + 1
cluster_means = np.zeros(shape=(r, total_clusters))
for spndj in range(c):
clusterId = clusters[spndj] # element tells us cluster-id
cluster_means[:,clusterId] += mat[:, spndj]
## lets check the frequency of clusters
cluster_freq = np.bincount(clusters,minlength=(total_clusters))
## We need to divide the sum by frequency
for i in xrange(total_clusters):
cluster_means[:,i] = cluster_means[:,i]/cluster_freq[i]
return cluster_means
def separateClusters(kcluster, labels=[]):
"""
    Returns a list of lists containing the separated clusters, i.e. SPNDs
    that belong to the same cluster are put together in one list. There are
    as many lists as there are clusters.
Parameters
----------
kcluster : list
Takes a 1-D cluster list representing {SPND <--> Cluster} mapping.
Returns
-------
cluster_result : list (of lists)
        A list of member lists, each containing the integer indexes of the
        SPNDs belonging to one cluster.
Example
-------
>>> from kcTools import separateClusters
>>> clusters = [ 2, 0, 1, 3, 2, 0, 1, 3, 2, 0, 1, 3]
    >>> print separateClusters(clusters)
    [[1, 5, 9], [2, 6, 10], [0, 4, 8], [3, 7, 11]]
"""
num_of_clusters = max(kcluster) + 1
cluster_result = [] # to hold our data
# Create a container
for index in range( num_of_clusters ):
cluster_result.append([])
# Fill the container
for SPND in range( len( kcluster )):
cluster_result[kcluster[SPND]].append( SPND )
# index = 0
# for clst in cluster_result:
# print ("{}: {}".format(index,clst))
# index += 1
# return None
return cluster_result
def prettyPrint(kcluster, labels=[]):
"""
Prints human readable format from cluster data.
Parameters
----------
kcluster : list
cluster list representing SPNDs to Cluster mapping.
labels : list, optional
list containing string names of SPNDs. If no list is passed automatic
numbering is used.
Returns
-------
Displays output on stdout.
Example
-------
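    A short usage sketch:

    >>> from kcTools import prettyPrint
    >>> clusters = [2, 0, 1, 0, 2, 1]
    >>> prettyPrint(clusters, ['a', 'b', 'c', 'd', 'e', 'f'])
    0: ['b', 'd']
    1: ['c', 'f']
    2: ['a', 'e']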
"""
num_of_clusters = max(kcluster) + 1
if not labels:
labels = [str(i) for i in range(len(kcluster))]
if DEBUG:
print "B:", len(labels),len(kcluster)
if not (len( kcluster ) == len( labels )):
print("Mis-match in number of SPNDs and label count. Exiting!")
exit()
cluster_result = [] # to hold our data
for index in range( num_of_clusters ):
cluster_result.append([])
for idx in range( len( kcluster )):
cluster_result[kcluster[idx]].append( labels[idx] )
index = 0
for clst in cluster_result:
print ("{}: {}".format(index,clst))
index += 1
return None
def getInterClusterS(mat, clusters):
'''
Calculates the sum of intercluster distances.
Parameters
----------
    mat : numpy.ndarray
SPND data used for clustering.
clusters : list
cluster map of SPND.
Returns
-------
interClusterDistance : float
        Sum over clusters of the ``1 - |correlation|`` distances between each
        cluster centroid and the global mean, weighted by cluster size.
Example
-------
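    A short usage sketch (random data, so the returned value is only illustrative):

    >>> import numpy as np
    >>> import Pycluster
    >>> from kcTools import getInterClusterS
    >>> mat = np.random.randn(200, 6)
    >>> kc, err, freq = Pycluster.kcluster(mat, nclusters=3, transpose=1, dist='a', npass=10)
    >>> S_b = getInterClusterS(mat, kc)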
'''
Tn, SPNDn = mat.shape
if DEBUG:
print("{} records, {} sensors".format(Tn, SPNDn))
# Get individual channel means
global_mean_vector = np.zeros(shape=(Tn,1))
for Ti in range(Tn):
global_mean_vector[Ti,0]=sum(mat[Ti,:])/SPNDn
# Get cluster means !!! Q: What If a cluster is empty?
# A: It seems the kcluster function doesn't
# produce an empty cluster.
total_clusters = np.max(clusters) + 1
if DEBUG:
print ("{} clusters found".format(total_clusters))
# cluster_means = np.zeros(shape=(Tn,total_clusters))
#list_cluster = list(clusters)
#print("clusters: {}".format(clusters))
#print("clusters_list: {}".format(list_cluster))
##**********************************************
# Cluster mean calculation
# ------------------------
# We already have the means of each spnd in means_vector
# Now we will combine those spnd means which belong to same cluster
#
##**********************************************
# index = 0 # index is basically the current spnd_number
# for element in clusters: # element tells us cluster number
# cluster_means[:,element] += mat[:, index]
# index += 1
r,c = mat.shape
# for spndj in range(c):
# clusterId = clusters[spndj] # element tells us cluster-id
# cluster_means[:,clusterId] += mat[:, spndj]
# ## lets check the frequency of clusters
cluster_freq = np.bincount(clusters,minlength=(total_clusters))
# if DEBUG:
# print cluster_freq
# print np.sum(cluster_freq) # should be equal to spnd count
# ## We need to divide the summed
# for i in xrange(total_clusters):
# cluster_means[:,i] = cluster_means[:,i]/cluster_freq[i]
# if DEBUG:
# print("Cluster centroids:{}".format( cluster_means ))
#cluster_means = getCentroids(mat, clusters) # Manual Method
cluster_means, mm = Pycluster.clustercentroids( mat, None, clusters, 'a',1)
# Now we have individual cluster means
# We can calculate the cluster distances from global centroid
cluster_distances = np.zeros(shape=(1,total_clusters))
for i in xrange(total_clusters):
#cluster_distances[0,i] = 1-abs(np.corrcoef(cluster_means[:,i],global_mean_vector))
#print "debug step"
#print cluster_means[:,i].shape
#print global_mean_vector[:,0].shape
corrMat = abs(np.corrcoef(cluster_means[:,i],global_mean_vector[:,0]))
if DEBUG:
print("Correlation MAT 0:\r\n{}".format(corrMat))
corrMat = 1 - corrMat
#print("Correlation MAT 1:\r\n{}".format(corrMat))
cluster_distances[0,i] = corrMat[0,1]
# scale the distances by number of SPND's in each cluster
for i in xrange(total_clusters):
cluster_distances[0,i] = cluster_distances[0,i] * cluster_freq[i]
#exit()
#print np.sum(cluster_distances)
return np.sum(cluster_distances)
def count_nan(arr):
"""
    Counts occurrences of numpy.nan in the passed data structure.
Parameters
----------
arr : numpy.ndarray, list
The dataset in which nan(s) is to be counted.
Returns
-------
count : int
        number of occurrences of nan within the passed data.
Example
-------
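    A short usage sketch:

    >>> import numpy as np
    >>> from kcTools import count_nan
    >>> count_nan(np.array([1.0, np.nan, 3.0, np.nan]))
    2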
"""
mask = np.ones_like(arr)
mask[np.isnan(arr)]=0
return (mask==0).sum()
def custom_filter(mat):
"""
    Converts negative and NaN elements to 0 (zero) in the passed data.
Parameters
----------
mat : numpy.ndarray
matrix to be cleaned of -ve values
Returns
-------
cleanMat : numpy.ndarray
        matrix with negative and NaN elements converted to zero
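    Example
    -------
    A short usage sketch (the negative and the NaN entry are both zeroed):

    >>> import numpy as np
    >>> from kcTools import custom_filter
    >>> clean = custom_filter(np.array([[1.0, -2.0], [np.nan, 4.0]]))
    >>> (clean == 0).sum()
    2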
"""
mask = np.ones_like(mat)
mat = np.nan_to_num(mat)
mask[ mat < 0 ]=0
# if DEBUG:
# print mask
return np.multiply(mat,mask) #element wise MULTIPLY
def removeFaultySensors(matSensor, labelSensor, minSensorOutput=0, allowedFault=25):
"""
    Eliminates those SPNDs which have at least x% bad sensor readings, where x is provided by the user.
Parameters
----------
matSensor : numpy.ndarray
Data to be filtered.
labelSensor : list
list of SPND labels.
minSensorOutput : float
        sensor faulty value (output) or `bad` value threshold. If a sensor
        value is at or below this threshold then it is considered a `bad` value.
allowedFault : float
        Percentage threshold for total bad values. If an SPND has at least this
        percentage of `bad` readings then it is eliminated. The corresponding
        label is also removed from the list of labels.
Returns
-------
matSensor : numpy.ndarray
        SPND data after removal of faulty SPND column(s).
    labelSensor : list or numpy.ndarray
        label list after removal of faulty SPND label(s).
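    Example
    -------
    A short usage sketch (channel ``b`` has 40% bad readings and is dropped;
    diagnostic messages printed by the function are elided):

    >>> import numpy as np
    >>> from kcTools import removeFaultySensors
    >>> mat = np.ones((100, 3))
    >>> mat[:40, 1] = 0.0
    >>> mat, labels = removeFaultySensors(mat, ['a', 'b', 'c'], 0, 25)
    ...
    >>> list(labels)
    ['a', 'c']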
"""
r,c = matSensor.shape
total_samples = float(r)
faultCount=0
maskFaultOne = [1 for i in range(c)]
for i in range(c): #For each Sensor>>>
faultCount=0
sliceArr = matSensor[:,i]
faultCount = (sliceArr <= minSensorOutput).sum() #How many faults
if faultCount >= (allowedFault*total_samples)/100:
print("Faulty Channel:{} %% fault is {}".format(labelSensor[i],(faultCount*100)/total_samples))
maskFaultOne[i]=0 # Marked the Sensor on mask
if (np.array(maskFaultOne) == 0).sum(): # If mask indicates faulty sensors
boolFMask = np.array(maskFaultOne, dtype=bool)
rr,cc = matSensor.shape
matSensor = matSensor[:,boolFMask] #Cleaning
rrr,ccc = matSensor.shape
## We also need to delete the labels corresponding to faulty SPNDs
lbl = np.array(labelSensor)
labelSensor = lbl[boolFMask]
print("{} faulty channels in Sensor data".format(cc-ccc))
else:
print "No faulty channels in Sensor Data"
return matSensor, labelSensor
def getOptimalCluster(sensor_mat, Si_threshold=0.1, Kmax=8, **kwargs):
"""
Does clustering multiple times and tries to find the optimal cluster.
Parameters
----------
sensor_mat : numpy.ndarray
SPND data matrix.
    Si_threshold : float
Desired ratio of Intra to Inter Cluster distance. This value is a
measure of `closeness` of the similar SPNDs in a cluster.
Kmax : int
        Upper limit on the number of clusters. The clustering program starts
        from k=2 (two clusters) and keeps increasing k until :math:`S_i` falls
        below `Si_threshold` (or k reaches Kmax - 1).
Returns
-------
kcluster : list
A list containing the cluster mapping of SPNDs. The index corresponds
to SPND and value corresponds to Cluster.
Eg ``kcluster = [2,0,1]`` signifies
| :math:`0^{th} spnd :\\to cluster 2`
| :math:`1^{st} spnd :\\to cluster 0`
| :math:`2^{nd} spnd :\\to cluster 1`
error : float
Represents the sum of intra cluster distances.
freq : int
Represents how many times the optimal solution was found while clustering.
Example
-------
>>> from pylab import randn
>>> import numpy
    >>> from kcTools import getOptimalCluster, prettyPrint
>>> #Lets create a random matrix
>>> a = numpy.array(randn(100)).reshape(10,10)
>>> #perform clustering
    >>> kc, er, fc = getOptimalCluster(a, 0.5, 5)
>>> prettyPrint(kc,['a','b','c','d','e','f','g','h','i','j'])
0: ['d', 'h']
1: ['a', 'g']
2: ['b', 'f', 'i']
3: ['c', 'e', 'j']
...
"""
if 'npass' in kwargs:
npass = kwargs['npass']
else:
npass = 100
S_w = 0
S_b = 0
print "\r\n",datetime.datetime.now(),"npass = ",npass,"\r\n"
for k in range(2,Kmax):
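        # Positional arguments per the Pycluster docs: data, nclusters, mask, weight,
        # transpose, npass, method, dist, initialid. transpose=1 clusters the columns
        # (SPNDs), method 'a' uses the arithmetic-mean centroid and dist 'a' uses the
        # absolute Pearson correlation distance.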
kcluster, error, freq = Pycluster.kcluster(sensor_mat,k,None,None,1,npass,'a','a',None)
S_w = error
S_b = getInterClusterS(sensor_mat, kcluster)
### Can S_b be zero ?
S_i = (S_w / S_b)
print("for k:{}, Si:{}, Emin:{}".format(k,S_i,error))
if S_i <= Si_threshold:
break
return kcluster, error, freq
def loadPcaData(**kwargs): # wrapper
"""
    Returns `relMatrix` and `residueCovMat` (the PCA model data).
    This function searches for the stored data in the current directory or in
    some other directory (based on the `cpath` argument).
Parameters
----------
fname : str, optional
String Name of cluster to load from current dir
cpath : str, optional
System path of directory where cluster is stored.
Returns
-------
relMatrix: ndarray
PCA relations Matrix
residueCovMat : ndarray
Data-Covariance matrix
    .. note:: The order of the returned values must be tracked carefully.
"""
global DEBUG
if 'fname' in kwargs:
fn = kwargs['fname']
else:
kwargs['pca']=1
fn = getFreshCluster(**kwargs)
if DEBUG and fn:
print ("Most recent file:{}".format(fn))
if not fn:
return None
with np.load(fn) as data:
res = data['kc']
# print res
    if not (len(res) == 2):
        print "Stored PCA data is not valid. [old format]"
        exit()
    return res[0], res[1]
def savePcaData( relMat, residueCovMat, **kwargs):
"""
    Saves the PCA model data (relations matrix and residue covariance matrix) to disk.
Parameters
----------
    relMat : ndarray
PCA relations Matrix
residueCovMat : ndarray
Data-Covariance matrix
Returns
-------
name : str
        Returns the ``<filename>.np`` name if the data was successfully written to disk.
...
"""
#saves cluster to memory
global DEBUG
if 'debug' in kwargs:
DEBUG = kwargs['debug']
else:
DEBUG = False
if 'cpath' in kwargs:
cpath = kwargs['cpath']
else:
cpath = os.getcwd()
if DEBUG:
print("WARNING: No path provided! \nCluster will be stored in current dir:\n {}".format(cpath))
status = os.path.isdir(cpath)
if not status:
print ("ERROR: {} : directory doesn't exist! [{}.py]".format(cpath, __name__))
exit()
if 'fname' in kwargs:
fname = kwargs['fname']
else:
fname=None
if fname:
        fn = os.path.join(cpath, fname)
else:
if DEBUG:
print("WARNING: No filename provided! Using timestamp as name.")
n = strftime("%Y-%m-%d-%H-%M-%S")
fn = 'pca-{}.np'.format(n)
os.chdir(cpath)
    # here we bundle the two data pieces into a single array
    temp = [relMat, residueCovMat]
    finalData = np.array(temp)
    with open(fn, 'wb') as f:
np.savez(f, kc=finalData)
return fn
def loadKCFromDisk(**kwargs): # wrapper
"""
Returns `clusterMap`, `CoVcombinedMat`, `spndLabels`, `spndMeans`, `spndVars` from disk (saved data).
    This function searches for the stored data in the current directory or in
    some other directory (based on the `cpath` argument).
Parameters
----------
fname : str, optional
String Name of cluster to load from current dir
cpath : str, optional
System path of directory where cluster is stored.
Returns
-------
clusterMap : list
If a cluster is located in the given path( or current directory)
else ``None`` is returned.
CoVcombinedMat : numpy.ndarray
matrix stored in memory
spndLabels: list
List of string names for SPNDs
spndMeans: ndarray
1-D numpy array containing Mean of each SPND sensor.
spndVars: ndarray
        1-D numpy array containing the variance of each SPND sensor.
Example
-------
Here's a use case of this function ::
>>> from kcTools import *
>>> from pylab import randn
>>> import numpy as np
>>> mat = np.array(randn(1500)).reshape(150,10)
        >>> kc = np.array([i % 3 for i in range(10)])
>>> labels = ['a','b','c','d','e','f','g','h','i', 'j']
        >>> means = [np.mean(mat[:, i]) for i in range(10)]
        >>> vars = [np.var(mat[:, i]) for i in range(10)]
>>> saveKCToDisk(kc, mat, labels, means, vars)
>>> clusterMap, CoVcombinedMat, spndLabels, spndMeans, spndVars = loadKCFromDisk()
>>> print( "kc: {}\\r\\n\\r\\nmat:{} \\r\\n\\r\\nlbl:{} \\r\\n\\r\\nmeans:{} \\r\\n\\r\\nvars:{}".format(clusterMap, CoVcombinedMat, spndLabels, spndMeans, spndVars) )
    .. note:: The order of the returned values must be tracked carefully.
"""
global DEBUG
if 'fname' in kwargs:
fn = kwargs['fname']
else:
fn = getFreshCluster(**kwargs)
if DEBUG and fn:
print ("Most recent file:{}".format(fn))
if not fn:
return None
with np.load(fn) as data:
res = data['kc']
# print res
if not (len(res) == 5):
print "Stored cluster data is not valid. [old cluster]"
exit()
return res[0], res[1], res[2], res[3], res[4]
def saveKCToDisk( clusterMap, CoVcombinedMat, spndLabels, spndMeans, spndVars, **kwargs):
"""
    Saves a given cluster and its associated data to disk.
Parameters
----------
clusterMap: list
kcluster information to be saved to disk
CoVcombinedMat: ndarray
data used for clustering (mean centered and scaled)
spndLabels: list
String names for SPNDs
spndMeans: ndarray
1-D numpy array containing Mean of each SPND sensor.
spndVars: ndarray
        1-D numpy array containing the variance of each SPND sensor.
fname: str, optional
Name of data file to be created and stored on dir.
cpath : str, optional
Valid system path where the cluster must be stored.
Returns
-------
name : str
        Returns the ``<filename>.np`` name if the data was successfully written to disk.
...
"""
#saves cluster to memory
global DEBUG
if 'debug' in kwargs:
DEBUG = kwargs['debug']
else:
DEBUG = False
if 'cpath' in kwargs:
cpath = kwargs['cpath']
else:
cpath = os.getcwd()
if DEBUG:
print("WARNING: No path provided! \nCluster will be stored in current dir:\n {}".format(cpath))
status = os.path.isdir(cpath)
if not status:
print ("ERROR: {} : directory doesn't exist! [{}.py]".format(cpath, __name__))
exit()
if 'fname' in kwargs:
fname = kwargs['fname']
else:
fname=None
if fname:
        fn = os.path.join(cpath, fname)
else:
if DEBUG:
print("WARNING: No filename provided! Using timestamp as name.")
n = strftime("%Y-%m-%d-%H-%M-%S")
fn = 'cluster-{}.np'.format(n)
os.chdir(cpath)
    # here we bundle the five data pieces into a single array
temp = [clusterMap, CoVcombinedMat, spndLabels, spndMeans, spndVars]
finalData = np.array(temp)
    with open(fn, 'wb') as f:
np.savez(f, kc=finalData)
return fn
def getFreshCluster(**kwargs):
"""
Searches for a valid cluster on disk.
.. note:: No need to use this function directly, use :func:`~kClusterLib.kcTools.loadKCFromDisk`
instead
"""
global DEBUG
if 'debug' in kwargs:
DEBUG = kwargs['debug']
else:
DEBUG = False
if 'cpath' in kwargs:
cpath = kwargs['cpath']
print cpath
else:
if DEBUG:
print ("`cpath` not passed. Default directory will be searched for clusters")
cpath = "./"
status = os.path.isdir(cpath)
if not status:
print ("ERROR: {} : directory doesn't exist! [{}.py]".format(cpath, __name__))
exit()
    ## fl is the list of filenames
fl = os.listdir(cpath)
if 'pca' in kwargs:
        rex = re.compile(r'pca-\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2}')
else:
        rex = re.compile(r'cluster-\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2}')
fl = filter(lambda x: rex.match(x), fl)
fl_bkp = fl
if not fl:
if DEBUG:
print ("No cluster data found in path:\n{}".format(cpath))
return None
if DEBUG:
print ("{} previously stored cluster(s):".format(len(fl)))
for i in range(len(fl)):
print fl[i]
if len(fl) == 1 : # If there is only one clusterAvailable
return fl[0] # we return the only cluster we found
#Lets initialize our variable from first file's name
ntime = get_datetime(fl[0])
cursor = 0
for i in range(1,len(fl)):
if DEBUG:
print("")
ctime = get_datetime(fl[i])
if ctime > ntime:
ntime = ctime
cursor = i
return fl_bkp[cursor]
def get_datetime( fstr ):
"""
    Pass a valid timestamped file name to this function and get a
    valid ``datetime`` object.
Parameters
----------
fstr : str
        A file name containing a timestamp, e.g. ``cluster-YYYY-MM-DD-HH-MM-SS.np`` or ``pca-...``.
Returns
-------
timestamp : datetime instance
        Valid datetime object. This allows us to check the freshness of
        files, or how much time has elapsed since a particular file was
        created.
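    Example
    -------
    A short usage sketch (any ``cluster-...`` or ``pca-...`` file name works):

    >>> from kcTools import get_datetime
    >>> get_datetime('cluster-2015-06-01-12-30-05.np')
    datetime.datetime(2015, 6, 1, 12, 30, 5)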
"""
tokens = fstr[:-3].split('-')
tok = [int(ss) for ss in tokens[1:]]
return datetime.datetime(tok[0],tok[1],tok[2],tok[3],tok[4],tok[5])
def halt():
"""
    Halts the execution of the program until the user chooses to proceed.
Parameters
----------
None
Returns
-------
None
"""
while True:
inp = raw_input("Press enter to continue or 'q' to exit: ")
if not inp:
print "Continuing..."
break
if len(inp)==1:
if 'q' in inp:
exit()
break
print("\r\n Please provide a valid input!\r\n")
def isSingletonCluster(cluster):
'''Check if a singleton cluster exists in the passed 1-D SPND-cluster
mapping array.
Parameters
----------
cluster : list
1-D array of length( total number of SPNDs ) containing the SPND - cluster mapping.
Returns
-------
status : bool
Returns True if one or more singleton cluster labels occur in cluster.
Example
-------
>>> from kcTools import isSingletonCluster
    >>> clustr = [2,3,0,1,2,0,3,2,0] # Here cluster 1 is a singleton
    >>> print (isSingletonCluster(clustr))
    True
    >>> clustr = [2,3,0,2,2,0,3,2,0] # No singleton
    >>> print (isSingletonCluster(clustr))
    False
'''
cluster_freq = np.bincount(cluster,minlength=(len(cluster)+1))
if 1 in cluster_freq:
return True
    # No singleton found
return False
def mergeSingletonCluster( kcluster, mat, **kwargs):
'''
    Merges the singleton cluster (if present) into its nearest neighbour,
    using the distance :math:`S = 1 - |PC|` (PC = Pearson correlation), in the 1-D SPND to cluster map.
Parameters
----------
kcluster: list
List containing the 1-D SPND to cluster-id Mapping.
mat: ndarray
Matrix like structure containing the SPND sensor values.
Returns
-------
cluster: list
Updated cluster mapping
Example
-------
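    A short usage sketch (random data; cluster ``2`` is the singleton and is
    merged into whichever neighbour is closest):

    >>> import numpy as np
    >>> from kcTools import mergeSingletonCluster
    >>> mat = np.random.randn(100, 5)
    >>> kc = [0, 0, 1, 1, 2]
    >>> merged = mergeSingletonCluster(kc, mat)
    singleton CLUSTER id: 2
    singleton SPND id: 4
    >>> max(merged) + 1
    2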
'''
clusters = list(kcluster)
DEBUG = False
if 'debug' in kwargs:
DEBUG = kwargs['debug']
#check 1
if not isSingletonCluster(clusters):
return clusters
#cid is the id of SPND present in singleton cluster
cid = np.nan # Initially we assign it an invalid value
# We have a compact list that maps `SPNDs to clusterID`.
# changing the above mapping to simple form
# cluster = list containing SPND_IDs of member SPNDs
clusterList = separateClusters(clusters)
# Finding out the SPND_ID of singleton SPND
for c in clusterList:
if len(c)==1:
cid = clusterList.index(c)
# In previous step did we obtain the singleton SPND ID(?)
if np.isnan(cid):
print "No singleton Found! Exiting"
return clusters
print "singleton CLUSTER id:",cid
print "singleton SPND id:",clusterList[cid][-1]
# print "List of Clusters",clusterList
distances = []
for c in range(len(clusterList)):
if c == cid:
distances.append(np.inf) # Intentionally making it high
else:
distance = Pycluster.clusterdistance(mat, None, None, clusterList[c], clusterList[cid], transpose=1, dist='a', method='a')
#print " : ",distance
distances.append(distance)
if DEBUG:
print "Cluster Distances:",distances
smallDist = np.min(distances)
neighbourNearest = distances.index(smallDist)
if DEBUG:
print "Nearest Neighbour Distance", smallDist
print "Nearest Neighbour Cluster", neighbourNearest
# Merging means we insert the lonely SPND to neighbour cluster (which is a list of SPNDS)
singularSPND = clusters.index(cid)
clusters[singularSPND] = neighbourNearest
## Now we need to take care of cid
##
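    # Shift every cluster id greater than cid down by one so the cluster ids
    # stay contiguous after the singleton cluster disappears.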
for j in range(cid+1,max(kcluster)+1):
for i in range(len(clusters)):
if clusters[i] == j:
clusters[i] = j-1
## That was looooong
if DEBUG:
print "Merge Complete. Singleton cluster {} had SPND {}, New cluster Id for SPND {} is {}".format(cid,clusterList[cid][-1],clusterList[cid][-1],neighbourNearest)
print "Old cluster map was : {}".format(kcluster)
print "New cluster map is : {}".format(clusters)
return clusters