• birch聚类算法


    参考百度百科http://baike.baidu.com/link?url=LDYen7bEqt8o2l5mUrnZjQk1topFi36-MwLuhjuGf-1z4sQFtFq1xCEe0TCJwYVjGbu0C6cpuVMFIxNglvSnoa

    外加http://www.cnblogs.com/zhangchaoyang/articles/2200800.html

    学习birch聚类最好有B-树的知识

    结合了B-树的特性,birch算法适合于处理大数据。

    原因是:

    (1)CF 结构概括了簇的基本信息,并且是高度压缩的:它保存的聚类信息远小于实际数据点本身。每个新添加的数据点不再作为个体保存,其信息被融入到所属的簇中

    (2)增量式的学习方法,不用一次将数据全部加载到内存,可以一边添加数据一边进行学习


    下面是我的实现


    // birch-cluster.cpp : 定义控制台应用程序的入口点。
    //
    
    ///************birch-cluster*************///
    ///*******   author Marshall     ********///
    ///*******   2015.9.18           ********///
    ///*******   version 1.0         ********///
    
    
    #include "stdafx.h"
    #include<cmath>
    #include<cstdlib>
    #include<iostream>
    #include<time.h>
    #include<vector>
    
    #define BirchType int
    
    using namespace std;
    
    
    
    // Element-wise sum of two vectors of equal length.
    //
    // aa is taken by value and reused as the result buffer. bb is only read, so
    // it is taken by const reference; this also lets callers pass a temporary
    // (for example the result of another vector expression) as the right operand.
    // Both operands must have the same size; this is checked with _ASSERTE.
    vector<BirchType> operator+(vector<BirchType> aa, const vector<BirchType>& bb){
    	_ASSERTE(aa.size() == bb.size());
    	for (size_t i = 0; i < aa.size(); ++i)
    		aa[i] += bb[i];
    	return aa;
    }
    
    // Element-wise (Hadamard) product of two vectors of equal length.
    //
    // aa is taken by value and reused as the result buffer. bb is only read, so
    // it is taken by const reference; this also lets callers pass a temporary
    // as the right operand.
    // Both operands must have the same size; this is checked with _ASSERTE.
    vector<BirchType> operator*(vector<BirchType> aa, const vector<BirchType>& bb){
    	_ASSERTE(aa.size() == bb.size());
    	for (size_t i = 0; i < aa.size(); ++i)
    		aa[i] *= bb[i];
    	return aa;
    }
    
    // Element-wise difference (aa - bb) of two vectors of equal length.
    //
    // aa is taken by value and reused as the result buffer. bb is only read, so
    // it is taken by const reference; this also lets callers pass a temporary
    // as the right operand.
    // Both operands must have the same size; this is checked with _ASSERTE.
    vector<BirchType> operator-(vector<BirchType> aa, const vector<BirchType>& bb){
    	_ASSERTE(aa.size() == bb.size());
    	for (size_t i = 0; i < aa.size(); ++i)
    		aa[i] -= bb[i];
    	return aa;
    }
    
    // Scales every component of aa by the real factor k.
    // Each product is evaluated in double precision and converted back to
    // BirchType when it is stored into the (by-value) result vector.
    vector<BirchType> operator*(vector<BirchType>aa, double k){
    	for (vector<BirchType>::iterator it = aa.begin(); it != aa.end(); ++it)
    		*it = double(*it) * k;
    	return aa;
    }
    // Scales every component of aa by the integer factor k (scalar on the left).
    // The by-value argument is modified in place and returned.
    vector<BirchType> operator*(int k, vector<BirchType>aa){
    	vector<BirchType>::iterator cur = aa.begin();
    	while (cur != aa.end())
    	{
    		*cur *= k;
    		++cur;
    	}
    	return aa;
    }
    class birch
    {
    public:
    	struct Attribute
    	{
    		unsigned int dim;
    		vector<BirchType>data;
    		Attribute(unsigned int d) :dim(d)
    		{
    			data.resize(dim);
    		}
    	};
    	struct CF
    	{
    		unsigned int N;
    		vector<BirchType> LS;
    		vector<BirchType> SS;
    		CF(unsigned int N,
    			vector<BirchType> LS,
    			vector<BirchType>SS) :N(N), LS(LS), SS(SS){}
    		/*CF(CF& cc){//shallow copy is enough
    			this->N = cc.N;
    			this->LS = cc.LS;
    			this->SS = cc.SS;
    			}*/
    		CF(unsigned int dim){
    			N = 0;
    			LS.resize(dim);
    			SS.resize(dim);
    		};
    		CF(){};
    	};
    
    	struct Leaf;
    	struct MinCluster
    	{
    		CF cf;
    		Leaf*parent;
    		MinCluster()
    		{
    			parent = NULL;
    		}
    		MinCluster(CF cf)
    		{
    			parent = NULL;
    			this->cf = cf;
    		}
    	};
    
    	struct Leaf
    	{
    		Leaf*pre, *next;//to make up a leaf-list.for Nonleaf,NULL
    		Leaf*parent;
    		vector<Leaf*>*child;//对Leaf而言为NULL
    		vector<MinCluster>*cluster;//对NonLeaf而言为NULL
    		CF cf;
    		Leaf()
    		{
    			parent = pre = next = NULL;
    			child = NULL;
    			cluster = NULL;
    		}
    	};
    	void generate_data(int num, int dim, vector<int>&span)
    	{
    		this->dim = dim;
    		_ASSERTE(span.size() == dim);
    		for (int i = 0; i < num; i++)
    		{
    			Attribute att(dim);
    			for (int j = 0; j < dim; j++)
    				att.data[j] = span[j] * double(rand()) / double(RAND_MAX + 1.0);
    			dataset.push_back(att);
    		}
    	}
    	vector<Attribute>dataset;
    
    	int absorbnum;
    
    public:
    	birch(unsigned int b, unsigned int l, unsigned int t)
    		:B(b), L(l), T(t){
    		_ASSERTE(B > 2);
    		_ASSERTE(L > 3);
    		root = NULL;
    		time_t tt;
    		srand(time(&tt));
    		absorbnum = 0;
    	}
    	~birch();
    	void insert(Attribute att);
    
    
    private:
    
    	unsigned int B; //maximal num of child a Nonleaf will have
    	unsigned int L;//maximal num of MinCluster a leaf will haveLeaf
    	unsigned int T;// MinCluster的直径不能超过T
    	Leaf*root;
    	Leaf*head;//the head of the leaf-list at the bottom of the tree
    	int dim;
    
    
    private:
    	inline double lengthofvec(vector<BirchType>&aa){
    		double len = 0;
    		for (int i = 0; i < aa.size(); i++)
    			len += pow(aa[i], 2.0);
    		return sqrt(len);
    	}
    	double sumofvec(vector<BirchType>&aa){
    		double sum = 0;
    		for (int i = 0; i < aa.size(); i++)
    			sum += aa[i];
    		return sum;
    	}
    
    	double cal_inter_cluster_dis(CF &cf1, CF &cf2);
    	double cal_intra_cluster_dis();
    	double merge_cluster_diameter(CF &cf1, CF &cf2);
    	vector<BirchType>updateSS(vector<BirchType>&LS, vector<BirchType>&SS)
    	{
    		for (int i = 0; i < LS.size(); i++)
    			SS[i] += pow(LS[i], 2.0);
    		return SS;
    	}
    	CF updateCF(CF &c1, CF &c2)
    	{
    		return CF(c1.N + c2.N, c1.LS + c2.LS, c1.SS + c2.SS);
    	}
    	void updateCF(Leaf*leaf)
    	{
    		CF cf(dim);
    		if (leaf->cluster != NULL)
    		{
    
    			for (int i = 0; i < leaf->cluster->size(); i++)
    			{
    				cf.N = cf.N + (*leaf->cluster)[i].cf.N;
    				cf.LS = cf.LS + (*leaf->cluster)[i].cf.LS;
    				cf.SS = cf.SS + (*leaf->cluster)[i].cf.SS;
    			}
    		}
    		else if (leaf->child != NULL)
    		{
    			for (int i = 0; i < leaf->child->size(); i++)
    			{
    				cf.N = cf.N + (*leaf->child)[i]->cf.N;
    				cf.LS = cf.LS + (*leaf->child)[i]->cf.LS;
    				cf.SS = cf.SS + (*leaf->child)[i]->cf.SS;
    			}
    		}
    		leaf->cf = cf;
    	}
    
    	MinCluster create_mincluster(Attribute att)
    	{
    		vector<BirchType>aa;
    		aa.resize(att.dim);
    		return MinCluster(CF(1, att.data, updateSS(att.data, aa)));
    	}
    
    	void insert(Leaf*close, bool &split, MinCluster &clu);
    
    
    
    };
    
    // Releases every node of the tree together with the MinCluster / child
    // vectors the nodes own.
    //
    // The tree is walked from the root only: every leaf is reachable through its
    // parent's child vector, so the leaf list does not have to be followed. A tree
    // into which nothing was ever inserted (root == NULL) owns nothing.
    birch::~birch()
    {
    	if (root == NULL)
    		return;
    
    	vector<Leaf*>pending;
    	pending.push_back(root);
    	while (!pending.empty())
    	{
    		Leaf*node = pending.back();
    		pending.pop_back();
    		if (node->child != NULL)
    		{
    			// queue the children before the vector holding them is released
    			pending.insert(pending.end(), node->child->begin(), node->child->end());
    			delete node->child;
    		}
    		delete node->cluster;//NULL for non-leaf nodes; deleting NULL is a no-op
    		delete node;
    	}
    	root = NULL;
    	head = NULL;
    }
    /*double birch::merge_cluster_diameter(CF &cf1, CF &cf2)
    {
    return sqrt(sumofvec(cf1.SS *(1.0 / double(cf1.N))
    + cf2.SS *(1.0 / double(cf1.N)) -
    2 * cf1.LS*cf2.LS*(1.0 / double(cf1.N + cf2.N))));
    }*/
    
    // Diameter of the cluster that would result from merging cf1 and cf2.
    //
    // With N = N1 + N2, SS = SS1 + SS2 and LS = LS1 + LS2 the diameter (root of
    // the average squared distance over all pairs of distinct points) is
    //     D = sqrt( (2*N*sum(SS) - 2*sum(LS[i]^2)) / (N*(N-1)) ).
    // The computation is done entirely in double so that no intermediate value
    // is truncated to BirchType. Returns 0 when the merged cluster would hold
    // fewer than two points.
    double birch::merge_cluster_diameter(CF &cf1, CF &cf2)
    {
    	_ASSERTE(cf1.LS.size() == cf2.LS.size());
    	_ASSERTE(cf1.SS.size() == cf1.LS.size() && cf2.SS.size() == cf2.LS.size());
    
    	const double total = double(cf1.N) + double(cf2.N);
    	if (total < 2.0)
    		return 0.0;
    
    	// never read past the end of any of the four vectors
    	size_t count = cf1.LS.size();
    	if (cf2.LS.size() < count) count = cf2.LS.size();
    	if (cf1.SS.size() < count) count = cf1.SS.size();
    	if (cf2.SS.size() < count) count = cf2.SS.size();
    
    	double sumSS = 0.0;     // sum over dimensions of the merged SS
    	double sumLSsq = 0.0;   // sum over dimensions of the squared merged LS
    	for (size_t i = 0; i < count; ++i)
    	{
    		sumSS += double(cf1.SS[i]) + double(cf2.SS[i]);
    		const double ls = double(cf1.LS[i]) + double(cf2.LS[i]);
    		sumLSsq += ls * ls;
    	}
    
    	double squared = (2.0 * total * sumSS - 2.0 * sumLSsq) / (total * (total - 1.0));
    	if (squared < 0.0)
    		squared = 0.0;//guard against a tiny negative value caused by rounding
    	return sqrt(squared);
    }
    
    
    // Inserts one data point into the CF-tree.
    //
    // Steps:
    //  1. An empty tree becomes a single leaf holding one MinCluster.
    //  2. Otherwise descend from the root, at each non-leaf node following the
    //     child whose centroid is closest to the point.
    //  3. In the reached leaf, absorb the point into the closest MinCluster when
    //     the centroid distance is below T; otherwise append a new MinCluster.
    //  4. Add the point's CF to every node on the path from the root to the leaf.
    //  5. If the leaf now holds more than L MinClusters, split it around the two
    //     MinClusters that are farthest apart; then split ancestors that hold
    //     more than B children in the same way, creating a new root if needed.
    void birch::insert(Attribute att)
    {
    	if (root == NULL)
    	{
    		// CFs created later are sized from dim, so take the dimensionality
    		// from the first point in case generate_data() was never called.
    		dim = static_cast<int>(att.data.size());
    		root = new Leaf;
    		root->cluster = new vector < MinCluster > ;
    		root->cluster->push_back(create_mincluster(att));
    		root->cf = (*root->cluster)[0].cf;
    		head = root;
    		head->pre = NULL;
    		head->next = NULL;
    		return;
    	}
    	MinCluster clu = create_mincluster(att);
    	Leaf*leaf = root;
    
    	//index of the child taken at every non-leaf node on the way down
    	vector<int>path;
    
    	while (leaf->cluster == NULL)
    	{
    		int k = -1;
    		double mindis = 0;
    		for (size_t i = 0; i < leaf->child->size(); i++)
    		{
    			double dis = cal_inter_cluster_dis(clu.cf, (*leaf->child)[i]->cf);
    			//the first candidate is always accepted, so no distance sentinel is needed
    			if (k < 0 || dis < mindis)
    			{
    				mindis = dis;
    				k = static_cast<int>(i);
    			}
    		}
    
    		_ASSERTE(k >= 0);
    		path.push_back(k);
    		leaf = (*leaf->child)[k];
    	}
    
    	//find the MinCluster of the leaf that is closest to the new point
    	int k = -1;
    	double mindis = 0;
    	for (size_t i = 0; i < leaf->cluster->size(); i++)
    	{
    		double dis = cal_inter_cluster_dis(clu.cf, (*leaf->cluster)[i].cf);
    		if (k < 0 || dis < mindis)
    		{
    			mindis = dis;
    			k = static_cast<int>(i);
    		}
    	}
    	_ASSERTE(k >= 0);
    
    	if (k >= 0 && mindis < T)
    	{
    		//absorb
    		(*leaf->cluster)[k].cf = updateCF((*leaf->cluster)[k].cf, clu.cf);
    		absorbnum++;
    	}
    	else
    	{
    		leaf->cluster->push_back(clu);
    	}
    	//update CF value along the path
    	Leaf*lea = root;
    	lea->cf = updateCF(lea->cf, clu.cf);
    	for (size_t i = 0; i < path.size(); i++)
    	{
    		lea = (*lea->child)[path[i]];
    		lea->cf = updateCF(lea->cf, clu.cf);
    	}
    
    	if (leaf->cluster->size() > L && leaf->cluster->size() >= 2)
    	{
    		const size_t count = leaf->cluster->size();
    		vector<vector<double> > dismatrix(count, vector<double>(count, 0.0));
    		//find the two MinClusters that are farthest apart. Starting below
    		//zero makes the first pair a valid seed even when all centroids coincide.
    		double maxdis = -1.0;
    		size_t th1 = 0;
    		size_t th2 = 1;
    		for (size_t i = 0; i + 1 < count; i++)
    			for (size_t h = i + 1; h < count; h++)
    			{
    				double dis = cal_inter_cluster_dis((*leaf->cluster)[i].cf, (*leaf->cluster)[h].cf);
    				dismatrix[i][h] = dis;
    				dismatrix[h][i] = dis;
    				if (dis > maxdis)
    				{
    					maxdis = dis;
    					th1 = i; th2 = h;
    				}
    			}
    		Leaf*new_leaf = new Leaf;
    		new_leaf->cluster = new vector < MinCluster > ;
    		//th2 seeds the new leaf; parent == new_leaf also marks the entries
    		//that have to be removed from the old leaf below
    		(*leaf->cluster)[th2].parent = new_leaf;
    		new_leaf->cluster->push_back((*leaf->cluster)[th2]);
    
    		//move every other MinCluster that is closer to th2 than to th1
    		for (size_t i = 0; i < count; i++)
    		{
    			if (i == th1 || i == th2)
    				continue;
    			if (dismatrix[i][th2] < dismatrix[i][th1])
    			{
    				(*leaf->cluster)[i].parent = new_leaf;
    				new_leaf->cluster->push_back((*leaf->cluster)[i]);
    			}
    		}
    
    		vector < MinCluster >::iterator it = leaf->cluster->begin();
    		while (it != leaf->cluster->end())
    		{
    			if (it->parent == new_leaf)
    				it = leaf->cluster->erase(it);
    			else
    				++it;
    		}
    		//recompute the CFs of both halves
    		updateCF(leaf);
    		updateCF(new_leaf);
    		//link new_leaf into the leaf list right after leaf
    		Leaf*next = leaf->next;
    		leaf->next = new_leaf;
    		new_leaf->pre = leaf;
    		new_leaf->next = next;
    		if (next)
    			next->pre = new_leaf;
    		if (leaf->parent != NULL)
    		{
    			leaf->parent->child->push_back(new_leaf);
    			new_leaf->parent = leaf->parent;
    		}
    		else//leaf is root,then a new root should be created
    		{
    			Leaf*new_root = new Leaf;
    			new_root->child = new vector < Leaf* > ;
    			new_root->child->push_back(leaf);
    			new_root->child->push_back(new_leaf);
    			leaf->parent = new_root;
    			new_leaf->parent = new_root;
    			updateCF(new_root);
    			root = new_root;
    			return;
    		}
    	}
    	//propagate splits upwards while a non-leaf node has too many children
    	Leaf*cur = leaf->parent;
    	while (cur != NULL && cur->child->size() > B && cur->child->size() >= 2)
    	{
    		const size_t count = cur->child->size();
    		vector<vector<double> > dismatrix(count, vector<double>(count, 0.0));
    		//find the two children that are farthest apart (see the leaf split
    		//above for the choice of the initial values)
    		double maxdis = -1.0;
    		size_t th1 = 0;
    		size_t th2 = 1;
    		for (size_t i = 0; i + 1 < count; i++)
    			for (size_t h = i + 1; h < count; h++)
    			{
    				double dis = cal_inter_cluster_dis((*cur->child)[i]->cf, (*cur->child)[h]->cf);
    				dismatrix[i][h] = dis;
    				dismatrix[h][i] = dis;
    				if (dis > maxdis)
    				{
    					maxdis = dis;
    					th1 = i; th2 = h;
    				}
    			}
    
    		Leaf*new_leaf1 = new Leaf;
    		new_leaf1->child = new vector < Leaf* > ;
    		(*cur->child)[th2]->parent = new_leaf1;
    		new_leaf1->child->push_back((*cur->child)[th2]);
    
    		//move every other child that is closer to th2 than to th1
    		for (size_t i = 0; i < count; i++)
    		{
    			if (i == th1 || i == th2)
    				continue;
    			if (dismatrix[i][th2] < dismatrix[i][th1])
    			{
    				(*cur->child)[i]->parent = new_leaf1;
    				new_leaf1->child->push_back((*cur->child)[i]);
    			}
    		}
    
    		//drop the moved children (now parented by new_leaf1) from cur
    		vector < Leaf* >::iterator it = cur->child->begin();
    		while (it != cur->child->end())
    		{
    			if ((*it)->parent == new_leaf1)
    				it = cur->child->erase(it);
    			else
    				++it;
    		}
    		//recompute the CFs of both halves
    		updateCF(cur);
    		updateCF(new_leaf1);
    
    		//if cur is root,then a new root should be created
    		if (cur->parent == NULL)
    		{
    			Leaf*new_root = new Leaf;
    			new_root->child = new vector < Leaf* > ;
    			new_root->child->push_back(cur);
    			new_root->child->push_back(new_leaf1);
    			cur->parent = new_root;
    			new_leaf1->parent = new_root;
    			updateCF(new_root);
    			root = new_root;
    			return;
    		}
    
    		//cur is not root: hang new_leaf1 under cur's parent and continue there
    		cur->parent->child->push_back(new_leaf1);
    		new_leaf1->parent = cur->parent;
    		cur = cur->parent;
    	}
    
    }
    
    
    
    //根据CF值计算簇间距离
    /*double birch::cal_inter_cluster_dis(CF &cf1, CF &cf2)
    {
    return sqrt(sumofvec((2 * (cf1.N + cf2.N)*(cf1.SS + cf2.SS)
    - 2 * (cf1.LS + cf2.LS)*(cf1.LS + cf2.LS))*
    (1.0 / double(cf1.N + cf2.N)*(cf1.N + cf2.N - 1))));
    }*/
    
    // Euclidean distance between the centroids (LS / N) of two clustering features.
    //
    // The number of dimensions is taken from the LS vectors themselves rather
    // than from the member `dim`, so the result does not depend on
    // generate_data() having been called. Both CFs must summarise at least one
    // point and have LS vectors of the same length (checked with _ASSERTE).
    double birch::cal_inter_cluster_dis(CF &cf1, CF &cf2)
    {
    	_ASSERTE(cf1.N > 0 && cf2.N > 0);
    	_ASSERTE(cf1.LS.size() == cf2.LS.size());
    
    	//never read past the end of the shorter vector
    	const size_t count = cf1.LS.size() < cf2.LS.size() ? cf1.LS.size() : cf2.LS.size();
    	const double n1 = double(cf1.N);
    	const double n2 = double(cf2.N);
    
    	double squared = 0;
    	for (size_t i = 0; i < count; i++)
    	{
    		const double diff = double(cf1.LS[i]) / n1 - double(cf2.LS[i]) / n2;
    		squared += diff*diff;
    	}
    
    	return sqrt(squared);
    }
    
    
    
    
    // Console entry point: builds a tree with B = 5, L = 6 and T = 20, fills it
    // with 1000 random two-dimensional points whose coordinates lie in
    // [0, 1000), prints how many points were absorbed into an existing
    // MinCluster and then waits for a key press.
    int _tmain(int argc, _TCHAR* argv[])
    {
    	birch bir(5, 6, 20);
    
    	int dim = 2;
    	int num = 1000;
    	//every dimension uses the same value range
    	vector<int>span(dim, 1000);
    	bir.generate_data(num, dim, span);
    
    	int idx = 0;
    	while (idx < num)
    	{
    		bir.insert(bir.dataset[idx]);
    		++idx;
    	}
    
    	cout << bir.absorbnum << endl;
    	system("pause");
    	return 0;
    }
    


    版权声明:

  • 相关阅读:
    Kafka调试入门(一)
    java笔记十五——多线程
    java笔记十四——初始集合源码
    java笔记十二——集合总结
    java笔记十一——异常
    java笔记十——大数类和日期类
    java笔记九——Object类与String类
    java笔记八——面向对象(三)
    java笔记七——面向对象(二)
    java笔记六——面向对象(一)
  • 原文地址:https://www.cnblogs.com/walccott/p/4956868.html
Copyright © 2020-2023  润新知