现在的位置: 首页 > 综合 > 正文

聚类算法(层级聚类,K-Means)

2013年09月14日 ⁄ 综合 ⁄ 共 2873字 ⁄ 字号 评论关闭
#!/usr/bin/python
import sys
sys.path.append('..')
import modules.similarity as sim # http://hi.baidu.com/bithigher/item/6ca6844ac29d0128fb896026  
import random
words = []
blogs = []
data = []
def getData(file = 'blogdata.txt'):
    global words, map
    f = open(file)
    line = f.readline()
    words = line.strip().split('\t')[1:]
    
    for line in f:
        p = line.strip().split('\t')
        blogs.append(p[0])
        data.append([float(x) for x in p[1:]])
    f.close()
    
def hcluster(similarity = sim.pearson):
    global data
    clusters = [bicluster(data[i], id=i) for i in range(len(data))]
    
    currentId = -1 # id for merged bicluster
    dis = {}    #record the distance computed before
    while(len(clusters) > 1):
        closestIndex = (0, 1)
        closestKey = (clusters[0].id, clusters[1].id)
        if closestKey not in dis:
            dis[closestKey] = 1 - similarity(clusters[0].vec, clusters[1].vec)
        closest = dis[closestKey]
        
        for i in range(len(clusters)):
            for j in range(i+1, len(clusters)):
                key = (clusters[i].id, clusters[j].id)
                if key not in dis:
                    dis[key] = 1 - similarity(clusters[i].vec, clusters[j].vec)
                if(dis[key] < closest):
                    closest = dis[key]
                    closestKey = key
                    closestIndex = (i, j)
        
        # generate a new bicluster and delete the closePair
        mergedVec = [(clusters[closestIndex[0]].vec[i] + clusters[closestIndex[1]].vec[i])/2.0 
            for i in range(len(clusters[0].vec))]
        clusters.append(bicluster(mergedVec, 
                                   clusters[closestIndex[0]], 
                                   clusters[closestIndex[1]], 
                                   closest, 
                                   currentId))
        currentId -= 1
        del dis[closestKey]
        
        #ATTENTION!!! remove the later one first
        #ATTENTION!!! because the index will change if remove the former one first
        del clusters[closestIndex[1]]   
        del clusters[closestIndex[0]]
        print closestIndex, closestKey
        
    return clusters[0]
        
def rotateData():
    global data
    newData = []
    for i in range(len(data[0])):
        newData.append([data[j][i] for j in range(len(data))])
    
    data = newData
        
def printCluster(cluster, labels, n=0):
    if cluster == None: return
    for i in range(n):
        print ' ',
    if(cluster.id >= 0):
        print '-',  labels[cluster.id]
    else:
        print '+', cluster.id
        printCluster(cluster.left, labels, n+1)
        printCluster(cluster.right, labels, n+1)
    
class bicluster:
    def __init__(self, vec, left=None, right=None, distance=0, id=None):
        self.vec = vec
        self.left = left
        self.right = right
        self.distance = distance
        self.id = id
        
        
def kmeans(k=5, iterations=100, similarity=sim.pearson):
    global data
    if(len(data) <= k): return data
    
    clusters = [[] for i in range(k)]
    length = len(data[0])
    ranges = [(min(row[i] for row in data), max(row[i] for row in data)) 
                for i in range(length)]
    #random k centers
    centers = [[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0] 
                for i in range(length)]
                for j in range(k)]
    
    # in case of falling into endless loop
    while(iterations > 0):
        iterations -= 1
        clusters = [[] for i in range(k)]
        newCenters = [[0 for i in range(length)] for j in range(k)]
        for i in range(len(data)):
            minVal = 1 - similarity(data[i], centers[0])
            minIndex = 0
            for j in range(1, k):
                curMin = 1 - similarity(data[i], centers[j])
                if curMin < minVal:
                    minVal = curMin
                    minIndex = j
                    
            clusters[minIndex].append(i)
            newCenters[minIndex] = [newCenters[minIndex][j] + data[i][j] for j in range(length)]
            
        for i in range(k):
            num = float(len(clusters[i]))
            if(num > 0):
                newCenters[i] = [newCenters[i][j] / num for j in range(length)]
        
        if(newCenters == centers):
            return clusters
        else:
            centers = newCenters

抱歉!评论已关闭.