KJB
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
kmeans.h
Go to the documentation of this file.
1 // kmeans.h - K-Means clustering class
3 // Author: Doron Tal
4 // Date created: March, 2000
5 
6 #ifndef _KMEANS_H
7 #define _KMEANS_H
8 
9 #include "wrap_dtlib_cpp/img.h"
10 
11 /*
12  * Kobus: We have run into trouble with 32 bit centric code in this
13  * distribution. I have changed some long's to kjb_int32's. The immediate
14  * problem is that the segmentation maps can get written out as 64 bit integers.
15 */
16 #include "l/l_sys_def.h"
17 
18 #warning "[Code police] Do not put 'using namespace' in global scope of header."
19 using namespace std;
20 #warning "[Code police] Do not put 'using namespace' in global scope of header."
21 using namespace kjb_c;
22 
23 
24 namespace DTLib {
25 
26  // K-Means class - vector quantization on data in Euclidean space
27  // --------------------------------------------------------------
28  //
29  // Usage example:
30  // CKMeans ck(P, K, D, ppData);
31  // ck.Init();
32  // ck.Iterate(20);
33  // ck.Prune(1.1f);
34  // ... and the answer is in ck.pPtClusters()
35  //
36  class CKMeans
37  {
38  public:
39  // Constructor doesn't initialize, just allocates memory and
40  // sets variables. To initialize, use one of the Init
41  // functions.
42  // 'P' - number of data points
43  // 'K' - number of clusters
44  // 'D' - number of dimensions
45  // 'ppData' - a pointer to either:
46  // (1) D arrays of length P each, such that
47  // ppDimsPts[d][p] is the Value of the
48  // d_th dimension of the p_th point. If this is
49  // the data format, then we need to make
50  // the next argument, 'bTransposeData' true.
51  // (2) P arrays of length D each, in which case,
52  // we must make 'bTransposeData' false.
53  // 'bTransposeData' - this arg. describes how the data is supplied
54  // in 'ppData'
55  // NOTE: if 'bTransposeData' is true, the constructor makes a
56  // local copy of the data that gets destroyed only when this
57  // kmeans object itself gets destroyed.
58  CKMeans(const int P, const int K, const int D, float** ppData,
59  bool bTransposeData = true);
60 
61  ~CKMeans();
62 
63  // Classic initialization: assign each cluster center to a
64  // data point chosen randomly. NOTE: the cluster centers
65  // could end up just about on top of each other, or near a
66  // part of the space that gets very few data points,
67  // i.e. anything can happen with this type of initialization.
68  void Init();
69 
70  // Usama Fayad's refined K-Means initialization procedure
71  void RefinedInit(const int& SubsampleSize, const int& J);
72 
73  // used in Usama Fayad's algorithm
74  void InitWithMeanVectors(float** ppMeanVectors);
75 
76  // "safer" Iterate(), for Usama Fayad's RefinedInit() above
77  void IterateMod();
78 
79  // Cluster a la K-Means. Termination: function terminates
80  // when either of two conditions are met: (a) terminate after
81  // fewer than 'nChanges' points change clusters; (b) terminate
82  // after 'nIterations' iterations. The resulting cluster
83  // assignment to each point is left in m_pClusterAssignments;
84  // RETURN VALUE: if the algorithm converged, returns the # of
85  // iterations; otherwise if 'nMaxIters' was reached, returns
86  // -1; otheriwse if 'nMaxChanges' was reached, returns -2.
87  int Iterate(const int& nMaxIters = -1, const int& nMaxChanges = -1,
88  const int& SmallestClusterSize = -1,
89  const int& Width = -1, const int& Height = -1);
90 
91  // at the start of Prune(), we compute the initial
92  // distortion, e*, we stop when the distortion becomes
93  // (StoppingFactor)*(e*).
94  void Prune(const float& StoppingFactor,
95  const int& StoppingK = -1);
96 
98  // DATA ACCESS FUNCTIONS
100 
101  int nK() { return m_nK; }
102 
103  // THIS IS THE OUTPUT OF K-MEANS
104  // assignments of clusters to the p data points, ZERO-INDEXED
105  // (i.e. first cluster center is indexed by 0, next index is 1, ..)
106  kjb_int32* pPtClusters() { return m_pPtClusters; }
107 
108  // as above, but detaches buffer from object (to avoid copying)
109  // NB: you must delete this pointer yourself eventually!
110  kjb_int32* pPtClustersDetach() {
111  kjb_int32* pRes = m_pPtClusters; m_pPtClusters = NULL; return pRes;
112  }
113 
114  private:
115  inline void SwapParents(const int& p, const int& iNearestK,
116  const int& iParentK);
117 
118  void CleanupMemberships2D(const int& Width, const int& Height);
119 
120  // (re)assigns a cluster to each data point by picking the closest one
121  // also recomputes the the number of points assigned to each cluster
122  // and the sum of all the points of each cluster
123  void AssignClustersToPoints();
124 
125  // Compute the mean for each cluster, using m_ppClusterDimSums
126  // and m_pClusterNumpts
127  void ComputeClusterMeans();
128 
129  void RemoveSmallClusters(const int& SmallestSizeAllowed = 1);
130 
131  // returns index of cluster nearest to data point indexed by 'p'
132  inline int FindNearestClusterToPoint(const int p);
133 
134  // 'k' - index of cluster center
135  // 'p' - index of data point
136  // RETURN VALUE: the squared distance between data point 'p'
137  // and cluster center 'k'
138  inline float ComputeSquareDistance(const int k, const int p);
139 
140  // return the total distortion, or squared error
141  float ComputeTotalError();
142 
144 
145  // true # of clusters currently used (can change after a call to
146  // Prune(), or to Iterate())
147  int m_nK;
148 
149  // original # of clusters class object was constructed with
150  int m_nOrigK;
151 
152  int m_nChanges;
153 
154  // # of data points (# of Img points)
155  int m_nP;
156 
157  // # of dimensions of each data point
158  int m_nD;
159 
160  // The data points. PxD matrix - P pointers to arrays of D floats each.
161  float** m_ppPtsDims;
162 
163  // if true, delete m_ppPtsDims, otherwise don't delete it
164  // (if we supplied the transposed data in the constructor then we
165  // haven't allocated the data internally and should not deallocate it,
166  // otherwise, we'll allocate it). This member var is used internally
167  // only.
168  bool m_bDeallocatePtsDims;
169 
170  // THIS IS THE OUTPUT OF K-MEANS
171  // the assignment of clusters to points (size = P)
172  kjb_int32* m_pPtClusters;
173 
174  // Array of K integers, each = # of points per cluster.
175 
176  int* m_pClusterNumpts;
177 
178  // KxD matrix - K pointers to arrays of D points each.
179  // each of these arrays corresponds to a cluster's mean
180  // but the mean is unnormalized (i.e. it's just the sum, not
181  // divided by the number of elements).
182  float** m_ppClusterDimSums;
183 
184  // KxD matrix - K pointers to arrays of D points each.
185  // each of these arrays corresponds to a cluster's mean.
186  float** m_ppClusterDimMeans;
187 
188  // temporary storage area used to avoid memory reallocation when
189  // removing clusters
190  int* m_pClusterNumpts2;
191  float** m_ppClusterDimSums2;
192  float** m_ppClusterDimMeans2;
193  };
194 
195 } // namespace DTLib {
196 
197 #endif /* #ifndef _KMEANS_H */
for k
Definition: APPgetLargeConnectedEdges.m:61
kjb_int32 * pPtClustersDetach()
Definition: kmeans.h:110
kjb_int32 * pPtClusters()
Definition: kmeans.h:106
int nK()
Definition: kmeans.h:101
D
Definition: APPgetLargeConnectedEdges.m:106
Definition: kmeans.h:36