from numpy import *


def loadDataSet(fileName):
    """Load a tab-separated file of numbers into a list of float rows.

    Args:
        fileName: Path of the text file to read. Each non-blank line is
            split on tab characters and every field is converted with
            ``float``.

    Returns:
        A list with one list of floats per non-blank line.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager closes the file even if a conversion fails.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank (often trailing) line carries no sample.
                continue
            # Build a real list: map() would be a lazy iterator on Python 3.
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat


def distEclude(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector (matrix row, ndarray or sequence of numbers).
        vecB: Second vector, broadcast-compatible with ``vecA``.

    Returns:
        The square root of the summed squared element differences.
    """
    # asarray makes the element-wise square valid for matrices, arrays and
    # plain sequences alike.
    diff = asarray(vecA) - asarray(vecB)
    return sqrt((diff ** 2).sum())


def randCent(dataSet, k):
    """Create ``k`` random centroids inside the bounding box of the data.

    Args:
        dataSet: 2-D matrix/array with one sample per row.
        k: Number of centroids to generate.

    Returns:
        A ``k`` x ``n`` matrix (``n`` = number of columns of ``dataSet``);
        column ``j`` is drawn uniformly between the minimum and maximum of
        column ``j`` of ``dataSet``.
    """
    n = shape(dataSet)[1]
    centroids = asmatrix(zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        # Use the array methods so the result does not depend on whether
        # ``min``/``max`` are the builtins or NumPy's after the star import.
        minJ = column.min()
        rangeJ = float(column.max() - minJ)
        centroids[:, j] = minJ + rangeJ * random.rand(k, 1)
    return centroids


def kMeans(dataSet, k, distMeas=distEclude, createCent=randCent):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Args:
        dataSet: 2-D matrix with one sample per row.
        k: Number of clusters.
        distMeas: Callable ``(centroidRow, sampleRow) -> distance``.
        createCent: Callable ``(dataSet, k) -> k x n matrix`` of initial
            centroids. The returned matrix is updated in place.

    Returns:
        A tuple ``(centroids, clusterAssment)``. ``centroids`` is the
        ``k`` x ``n`` centroid matrix. ``clusterAssment`` is an ``m`` x 2
        matrix whose first column is the index of the closest centroid and
        whose second column is the squared distance to it.
    """
    m = shape(dataSet)[0]
    clusterAssment = asmatrix(zeros((m, 2)))
    # Start from an impossible cluster index so that a first pass assigning
    # every point to cluster 0 is still recognised as a change.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        # Reset for this pass; any single reassignment turns it back on.
        clusterChanged = False
        for i in range(m):
            minDist = float("inf")
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            # Only ever set the flag, never clear it here: otherwise the
            # last point alone would decide whether the loop continues.
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        print(centroids)
        for cent in range(k):
            # Rows currently assigned to centroid ``cent``.
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            # An empty cluster keeps its previous centroid; averaging zero
            # rows would turn it into NaN for the rest of the run.
            if shape(ptsInClust)[0] > 0:
                centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment


# Example usage:
# dataSet = loadDataSet('testSet.txt')
# kMeans(asmatrix(dataSet), 3, distEclude, randCent)


def biKmeans(dataSet, k, distMeas=distEclude):
    """Cluster ``dataSet`` with bisecting k-means.

    Starts with a single cluster holding every sample and repeatedly splits
    one cluster in two with ``kMeans(..., 2)``, each time choosing the split
    that gives the lowest total sum of squared errors, until there are ``k``
    clusters.

    Args:
        dataSet: 2-D matrix (or anything ``asmatrix`` accepts) with one
            sample per row.
        k: Desired number of clusters.
        distMeas: Callable ``(centroidRow, sampleRow) -> distance``.

    Returns:
        A tuple ``(centList, clusterAssment)``. ``centList`` is a list of
        centroids: the initial mean is a plain list, centroids produced by
        a split are 1 x n matrices. ``clusterAssment`` is an ``m`` x 2
        matrix of (cluster index, squared distance to its centroid).
        ``centList`` can be shorter than ``k`` only if no cluster could be
        split (for example when ``dataSet`` has no rows).
    """
    # The code below relies on matrix semantics (2-D row slices, and
    # ``.tolist()[0]`` yielding the whole mean row), so normalise the input.
    dataSet = asmatrix(dataSet)
    m = shape(dataSet)[0]
    clusterAssment = asmatrix(zeros((m, 2)))
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distMeas(asmatrix(centroid0), dataSet[j, :]) ** 2
    while len(centList) < k:
        lowestSSE = float("inf")
        # Reset per round so a stale choice from an earlier round is never
        # reused when no candidate qualifies.
        bestCentToSplit = None
        for i in range(len(centList)):
            rowsInCluster = nonzero(clusterAssment[:, 0].A == i)[0]
            if len(rowsInCluster) == 0:
                # Nothing to split; randCent cannot handle zero rows.
                continue
            ptsInCurrCluster = dataSet[rowsInCluster, :]
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            sseSplit = splitClustAss[:, 1].sum()
            sseNotSplit = clusterAssment[
                nonzero(clusterAssment[:, 0].A != i)[0], 1].sum()
            print("sseSplit, and notSplit: %s %s" % (sseSplit, sseNotSplit))
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        if bestCentToSplit is None:
            # No cluster could be split, so stop instead of failing.
            break
        # After the split, sub-cluster 0 takes over the index of the cluster
        # that was split and sub-cluster 1 gets a brand-new index. Relabel
        # the 1s first so they cannot collide with the second relabelling.
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print('the bestCenttoSplit is: %s' % bestCentToSplit)
        print('the len of bestCustAss is: %s' % len(bestClustAss))
        centList[bestCentToSplit] = bestNewCents[0, :]
        centList.append(bestNewCents[1, :])
        # Update the global assignments for the points of the split cluster.
        clusterAssment[
            nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return centList, clusterAssment


# Example usage:
# datMat = asmatrix(loadDataSet('testSet2.txt'))
# centList, myNewAssments = biKmeans(datMat, 3)
# print(centList)