QGIS API Documentation 3.43.0-Master (6c62b930b02)
qgsalgorithmkmeansclustering.cpp
Go to the documentation of this file.
1/***************************************************************************
2 qgsalgorithmkmeansclustering.cpp
3 ---------------------
4 begin : June 2018
5 copyright : (C) 2018 by Nyall Dawson
6 email : nyall dot dawson at gmail dot com
7 ***************************************************************************/
8
9/***************************************************************************
10 * *
11 * This program is free software; you can redistribute it and/or modify *
12 * it under the terms of the GNU General Public License as published by *
13 * the Free Software Foundation; either version 2 of the License, or *
14 * (at your option) any later version. *
15 * *
16 ***************************************************************************/
17
19#include <unordered_map>
20#include <random>
21
23
24constexpr uint KMEANS_MAX_ITERATIONS = 1000;
25
26QString QgsKMeansClusteringAlgorithm::name() const
27{
28 return QStringLiteral( "kmeansclustering" );
29}
30
31QString QgsKMeansClusteringAlgorithm::displayName() const
32{
33 return QObject::tr( "K-means clustering" );
34}
35
36QStringList QgsKMeansClusteringAlgorithm::tags() const
37{
38 return QObject::tr( "clustering,clusters,kmeans,points" ).split( ',' );
39}
40
41QString QgsKMeansClusteringAlgorithm::group() const
42{
43 return QObject::tr( "Vector analysis" );
44}
45
46QString QgsKMeansClusteringAlgorithm::groupId() const
47{
48 return QStringLiteral( "vectoranalysis" );
49}
50
51void QgsKMeansClusteringAlgorithm::initAlgorithm( const QVariantMap & )
52{
53 addParameter( new QgsProcessingParameterFeatureSource( QStringLiteral( "INPUT" ), QObject::tr( "Input layer" ), QList<int>() << static_cast<int>( Qgis::ProcessingSourceType::VectorAnyGeometry ) ) );
54 addParameter( new QgsProcessingParameterNumber( QStringLiteral( "CLUSTERS" ), QObject::tr( "Number of clusters" ), Qgis::ProcessingNumberParameterType::Integer, 5, false, 1 ) );
55
56 QStringList initializationMethods;
57 initializationMethods << QObject::tr( "Farthest points" )
58 << QObject::tr( "K-means++" );
59 addParameter( new QgsProcessingParameterEnum( QStringLiteral( "METHOD" ), QObject::tr( "Method" ), initializationMethods, false, 0, false ) );
60
61 auto fieldNameParam = std::make_unique<QgsProcessingParameterString>( QStringLiteral( "FIELD_NAME" ), QObject::tr( "Cluster field name" ), QStringLiteral( "CLUSTER_ID" ) );
62 fieldNameParam->setFlags( fieldNameParam->flags() | Qgis::ProcessingParameterFlag::Advanced );
63 addParameter( fieldNameParam.release() );
64 auto sizeFieldNameParam = std::make_unique<QgsProcessingParameterString>( QStringLiteral( "SIZE_FIELD_NAME" ), QObject::tr( "Cluster size field name" ), QStringLiteral( "CLUSTER_SIZE" ) );
65 sizeFieldNameParam->setFlags( sizeFieldNameParam->flags() | Qgis::ProcessingParameterFlag::Advanced );
66 addParameter( sizeFieldNameParam.release() );
67
68 addParameter( new QgsProcessingParameterFeatureSink( QStringLiteral( "OUTPUT" ), QObject::tr( "Clusters" ), Qgis::ProcessingSourceType::VectorAnyGeometry ) );
69}
70
71QString QgsKMeansClusteringAlgorithm::shortHelpString() const
72{
73 return QObject::tr( "This algorithm calculates the 2D distance based k-means cluster number for each input feature.\n\n"
74 "If input geometries are lines or polygons, the clustering is based on the centroid of the feature.\n\n"
75 "References:\n"
76 "Arthur, David & Vassilvitskii, Sergei. (2007). K-Means++: The Advantages of Careful Seeding. Proc. of the Annu. ACM-SIAM Symp. on Discrete Algorithms. 8.\n\n"
77 "Bhattacharya, Anup & Eube, Jan & Röglin, Heiko & Schmidt, Melanie. (2019). Noisy, Greedy and Not So Greedy k-means++" );
78}
79
80QString QgsKMeansClusteringAlgorithm::shortDescription() const
81{
82 return QObject::tr( "Calculates the 2D distance based k-means cluster number for each input feature." );
83}
84
85QgsKMeansClusteringAlgorithm *QgsKMeansClusteringAlgorithm::createInstance() const
86{
87 return new QgsKMeansClusteringAlgorithm();
88}
89
90QVariantMap QgsKMeansClusteringAlgorithm::processAlgorithm( const QVariantMap &parameters, QgsProcessingContext &context, QgsProcessingFeedback *feedback )
91{
92 std::unique_ptr<QgsProcessingFeatureSource> source( parameterAsSource( parameters, QStringLiteral( "INPUT" ), context ) );
93 if ( !source )
94 throw QgsProcessingException( invalidSourceError( parameters, QStringLiteral( "INPUT" ) ) );
95
96 int k = parameterAsInt( parameters, QStringLiteral( "CLUSTERS" ), context );
97 int initializationMethod = parameterAsInt( parameters, QStringLiteral( "METHOD" ), context );
98
99 QgsFields outputFields = source->fields();
100 QgsFields newFields;
101 const QString clusterFieldName = parameterAsString( parameters, QStringLiteral( "FIELD_NAME" ), context );
102 newFields.append( QgsField( clusterFieldName, QMetaType::Type::Int ) );
103 const QString clusterSizeFieldName = parameterAsString( parameters, QStringLiteral( "SIZE_FIELD_NAME" ), context );
104 newFields.append( QgsField( clusterSizeFieldName, QMetaType::Type::Int ) );
105 outputFields = QgsProcessingUtils::combineFields( outputFields, newFields );
106
107 QString dest;
108 std::unique_ptr<QgsFeatureSink> sink( parameterAsSink( parameters, QStringLiteral( "OUTPUT" ), context, dest, outputFields, source->wkbType(), source->sourceCrs() ) );
109 if ( !sink )
110 throw QgsProcessingException( invalidSinkError( parameters, QStringLiteral( "OUTPUT" ) ) );
111
112 // build list of point inputs - if it's already a point, use that. If not, take the centroid.
113 feedback->pushInfo( QObject::tr( "Collecting input points" ) );
114 const double step = source->featureCount() > 0 ? 50.0 / static_cast< double >( source->featureCount() ) : 1;
115 int i = 0;
116 int n = 0;
117 int featureWithGeometryCount = 0;
118 QgsFeature feat;
119
120 std::vector<Feature> clusterFeatures;
121 QgsFeatureIterator features = source->getFeatures( QgsFeatureRequest().setNoAttributes() );
122 QHash<QgsFeatureId, std::size_t> idToObj;
123 while ( features.nextFeature( feat ) )
124 {
125 i++;
126 if ( feedback->isCanceled() )
127 {
128 break;
129 }
130
131 feedback->setProgress( i * step );
132 if ( !feat.hasGeometry() )
133 continue;
134 featureWithGeometryCount++;
135
136 QgsPointXY point;
138 point = QgsPointXY( *qgsgeometry_cast<const QgsPoint *>( feat.geometry().constGet() ) );
139 else
140 {
141 const QgsGeometry centroid = feat.geometry().centroid();
142 if ( centroid.isNull() )
143 continue; // centroid failed, e.g. empty linestring
144
145 point = QgsPointXY( *qgsgeometry_cast<const QgsPoint *>( centroid.constGet() ) );
146 }
147
148 n++;
149
150 idToObj[feat.id()] = clusterFeatures.size();
151 clusterFeatures.emplace_back( Feature( point ) );
152 }
153
154 if ( n < k )
155 {
156 feedback->reportError( QObject::tr( "Number of geometries is less than the number of clusters requested, not all clusters will get data" ) );
157 k = n;
158 }
159
160 if ( k > 1 )
161 {
162 feedback->pushInfo( QObject::tr( "Calculating clusters" ) );
163
164 // cluster centers
165 std::vector<QgsPointXY> centers( k );
166 switch ( initializationMethod )
167 {
168 case 0: // farthest points
169 initClustersFarthestPoints( clusterFeatures, centers, k, feedback );
170 break;
171 case 1: // k-means++
172 initClustersPlusPlus( clusterFeatures, centers, k, feedback );
173 break;
174 default:
175 break;
176 }
177 calculateKMeans( clusterFeatures, centers, k, feedback );
178 }
179
180 // cluster size
181 std::unordered_map<int, int> clusterSize;
182 for ( auto it = idToObj.constBegin(); it != idToObj.constEnd(); ++it )
183 {
184 clusterSize[clusterFeatures[it.value()].cluster]++;
185 }
186
187 features = source->getFeatures();
188 i = 0;
189 while ( features.nextFeature( feat ) )
190 {
191 i++;
192 if ( feedback->isCanceled() )
193 {
194 break;
195 }
196
197 feedback->setProgress( 50 + i * step );
198 QgsAttributes attr = feat.attributes();
199 const auto obj = idToObj.find( feat.id() );
200 if ( !feat.hasGeometry() || obj == idToObj.end() )
201 {
202 attr << QVariant() << QVariant();
203 }
204 else if ( k <= 1 )
205 {
206 attr << 0 << featureWithGeometryCount;
207 }
208 else
209 {
210 const int cluster = clusterFeatures[*obj].cluster;
211 attr << cluster << clusterSize[cluster];
212 }
213 feat.setAttributes( attr );
214 if ( !sink->addFeature( feat, QgsFeatureSink::FastInsert ) )
215 throw QgsProcessingException( writeFeatureError( sink.get(), parameters, QStringLiteral( "OUTPUT" ) ) );
216 }
217
218 sink->finalize();
219
220 QVariantMap outputs;
221 outputs.insert( QStringLiteral( "OUTPUT" ), dest );
222 return outputs;
223}
224
225// ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
226
227void QgsKMeansClusteringAlgorithm::initClustersFarthestPoints( std::vector<Feature> &points, std::vector<QgsPointXY> &centers, const int k, QgsProcessingFeedback *feedback )
228{
229 const std::size_t n = points.size();
230 if ( n == 0 )
231 return;
232
233 if ( n == 1 )
234 {
235 for ( int i = 0; i < k; i++ )
236 centers[i] = points[0].point;
237 return;
238 }
239
240 std::size_t duplicateCount = 1;
241 // initially scan for two most distance points from each other, p1 and p2
242 std::size_t p1 = 0;
243 std::size_t p2 = 0;
244 double distanceP1 = 0;
245 double distanceP2 = 0;
246 double maxDistance = -1;
247 for ( std::size_t i = 1; i < n; i++ )
248 {
249 distanceP1 = points[i].point.sqrDist( points[p1].point );
250 distanceP2 = points[i].point.sqrDist( points[p2].point );
251
252 // if this point is further then existing candidates, replace our choice
253 if ( ( distanceP1 > maxDistance ) || ( distanceP2 > maxDistance ) )
254 {
255 maxDistance = std::max( distanceP1, distanceP2 );
256 if ( distanceP1 > distanceP2 )
257 p2 = i;
258 else
259 p1 = i;
260 }
261
262 // also record count of duplicate points
263 if ( qgsDoubleNear( distanceP1, 0 ) || qgsDoubleNear( distanceP2, 0 ) )
264 duplicateCount++;
265 }
266
267 if ( feedback && duplicateCount > 1 )
268 {
269 feedback->pushWarning( QObject::tr( "There are at least %n duplicate input(s), the number of output clusters may be less than was requested", nullptr, static_cast< int >( duplicateCount ) ) );
270 }
271
272 // By now two points should be found and be not the same
273 // Q_ASSERT( p1 != p2 && maxDistance >= 0 );
274
275 // Accept these two points as our initial centers
276 centers[0] = points[p1].point;
277 centers[1] = points[p2].point;
278
279 if ( k > 2 )
280 {
281 // array of minimum distance to a point from accepted cluster centers
282 std::vector<double> distances( n );
283
284 // initialize array with distance to first object
285 for ( std::size_t j = 0; j < n; j++ )
286 {
287 distances[j] = points[j].point.sqrDist( centers[0] );
288 }
289 distances[p1] = -1;
290 distances[p2] = -1;
291
292 // loop i on clusters, skip 0 and 1 as found already
293 for ( int i = 2; i < k; i++ )
294 {
295 std::size_t candidateCenter = 0;
296 double maxDistance = std::numeric_limits<double>::lowest();
297
298 // loop j on points
299 for ( std::size_t j = 0; j < n; j++ )
300 {
301 // accepted clusters are already marked with distance = -1
302 if ( distances[j] < 0 )
303 continue;
304
305 // update minimal distance with previously accepted cluster
306 distances[j] = std::min( points[j].point.sqrDist( centers[i - 1] ), distances[j] );
307
308 // greedily take a point that's farthest from any of accepted clusters
309 if ( distances[j] > maxDistance )
310 {
311 candidateCenter = j;
312 maxDistance = distances[j];
313 }
314 }
315
316 // checked earlier by counting entries on input, just in case
317 Q_ASSERT( maxDistance >= 0 );
318
319 // accept candidate to centers
320 distances[candidateCenter] = -1;
321 // copy the point coordinates into the initial centers array
322 centers[i] = points[candidateCenter].point;
323 }
324 }
325}
326
327void QgsKMeansClusteringAlgorithm::initClustersPlusPlus( std::vector<Feature> &points, std::vector<QgsPointXY> &centers, const int k, QgsProcessingFeedback *feedback )
328{
329 const std::size_t n = points.size();
330 if ( n == 0 )
331 return;
332
333 if ( n == 1 )
334 {
335 for ( int i = 0; i < k; i++ )
336 centers[i] = points[0].point;
337 return;
338 }
339
340 // randomly select the first point
341 std::random_device rd;
342 std::mt19937 gen( rd() );
343 std::uniform_int_distribution<size_t> distrib( 0, n - 1 );
344
345 std::size_t p1 = distrib( gen );
346 centers[0] = points[p1].point;
347
348 // calculate distances and total error (sum of distances of points to center)
349 std::vector<double> distances( n );
350 double totalError = 0;
351 std::size_t duplicateCount = 1;
352 for ( size_t i = 0; i < n; i++ )
353 {
354 double distance = points[i].point.sqrDist( centers[0] );
355 distances[i] = distance;
356 totalError += distance;
357 if ( qgsDoubleNear( distance, 0 ) )
358 {
359 duplicateCount++;
360 }
361 }
362 if ( feedback && duplicateCount > 1 )
363 {
364 feedback->pushWarning( QObject::tr( "There are at least %n duplicate input(s), the number of output clusters may be less than was requested", nullptr, static_cast< int >( duplicateCount ) ) );
365 }
366
367 // greedy kmeans++
368 // test not only one center but L possible centers
369 // chosen independently according to the same probability distribution), and then among these L
370 // centers, the one that decreases the k-means cost the most is chosen
371 // Bhattacharya, Anup & Eube, Jan & Röglin, Heiko & Schmidt, Melanie. (2019). Noisy, greedy and Not So greedy k-means++
372 unsigned int numCandidateCenters = 2 + static_cast< int >( std::floor( std::log( k ) ) );
373 std::vector<double> randomNumbers( numCandidateCenters );
374 std::vector<size_t> candidateCenters( numCandidateCenters );
375
376 std::uniform_real_distribution<double> dis( 0.0, 1.0 );
377 for ( int i = 1; i < k; i++ )
378 {
379 // sampling with probability proportional to the squared distance to the closest existing center
380 for ( unsigned int j = 0; j < numCandidateCenters; j++ )
381 {
382 randomNumbers[j] = dis( gen ) * totalError;
383 }
384
385 // cumulative sum, keep distances for later use
386 std::vector<double> cumSum = distances;
387 for ( size_t j = 1; j < n; j++ )
388 {
389 cumSum[j] += cumSum[j - 1];
390 }
391
392 // binary search for the index of the first element greater than or equal to random numbers
393 for ( unsigned int j = 0; j < numCandidateCenters; j++ )
394 {
395 size_t low = 0;
396 size_t high = n - 1;
397
398 while ( low <= high )
399 {
400 size_t mid = low + ( high - low ) / 2;
401 if ( cumSum[mid] < randomNumbers[j] )
402 {
403 low = mid + 1;
404 }
405 else
406 {
407 // size_t cannot be negative
408 if ( mid == 0 )
409 break;
410
411 high = mid - 1;
412 }
413 }
414 // clip candidate center to the number of points
415 if ( low >= n )
416 {
417 low = n - 1;
418 }
419 candidateCenters[j] = low;
420 }
421
422 std::vector<std::vector<double>> distancesCandidateCenters( numCandidateCenters, std::vector<double>( n ) );
423 ;
424
425 // store distances between previous and new candidate center, error and best candidate index
426 double currentError = 0;
427 double lowestError = std::numeric_limits<double>::max();
428 unsigned int bestCandidateIndex = 0;
429 for ( unsigned int j = 0; j < numCandidateCenters; j++ )
430 {
431 for ( size_t z = 0; z < n; z++ )
432 {
433 // distance to candidate center
434 double distance = points[candidateCenters[j]].point.sqrDist( points[z].point );
435 // if distance to previous center is less than the current distance, use that
436 if ( distance > distances[z] )
437 {
438 distance = distances[z];
439 }
440 distancesCandidateCenters[j][z] = distance;
441 currentError += distance;
442 }
443 if ( lowestError > currentError )
444 {
445 lowestError = currentError;
446 bestCandidateIndex = j;
447 }
448 }
449
450 // update distances with the best candidate center values
451 for ( size_t j = 0; j < n; j++ )
452 {
453 distances[j] = distancesCandidateCenters[bestCandidateIndex][j];
454 }
455 // store the best candidate center
456 centers[i] = points[candidateCenters[bestCandidateIndex]].point;
457 // update error
458 totalError = lowestError;
459 }
460}
461
462// ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
463
464void QgsKMeansClusteringAlgorithm::calculateKMeans( std::vector<QgsKMeansClusteringAlgorithm::Feature> &objs, std::vector<QgsPointXY> &centers, int k, QgsProcessingFeedback *feedback )
465{
466 int converged = false;
467 bool changed = false;
468
469 // avoid reallocating weights array for every iteration
470 std::vector<uint> weights( k );
471
472 uint i = 0;
473 for ( i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++ )
474 {
475 if ( feedback && feedback->isCanceled() )
476 break;
477
478 findNearest( objs, centers, k, changed );
479 updateMeans( objs, centers, weights, k );
480 converged = !changed;
481 }
482
483 if ( !converged && feedback )
484 feedback->reportError( QObject::tr( "Clustering did not converge after %n iteration(s)", nullptr, static_cast<int>( i ) ) );
485 else if ( feedback )
486 feedback->pushInfo( QObject::tr( "Clustering converged after %n iteration(s)", nullptr, static_cast<int>( i ) ) );
487}
488
489// ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
490
491void QgsKMeansClusteringAlgorithm::findNearest( std::vector<QgsKMeansClusteringAlgorithm::Feature> &points, const std::vector<QgsPointXY> &centers, const int k, bool &changed )
492{
493 changed = false;
494 const std::size_t n = points.size();
495 for ( std::size_t i = 0; i < n; i++ )
496 {
497 Feature &point = points[i];
498
499 // Initialize with distance to first cluster
500 double currentDistance = point.point.sqrDist( centers[0] );
501 int currentCluster = 0;
502
503 // Check all other cluster centers and find the nearest
504 for ( int cluster = 1; cluster < k; cluster++ )
505 {
506 const double distance = point.point.sqrDist( centers[cluster] );
507 if ( distance < currentDistance )
508 {
509 currentDistance = distance;
510 currentCluster = cluster;
511 }
512 }
513
514 // Store the nearest cluster this object is in
515 if ( point.cluster != currentCluster )
516 {
517 changed = true;
518 point.cluster = currentCluster;
519 }
520 }
521}
522
523// ported from https://github.com/postgis/postgis/blob/svn-trunk/liblwgeom/lwkmeans.c
524
525void QgsKMeansClusteringAlgorithm::updateMeans( const std::vector<Feature> &points, std::vector<QgsPointXY> &centers, std::vector<uint> &weights, const int k )
526{
527 const uint n = points.size();
528 std::fill( weights.begin(), weights.end(), 0 );
529 for ( int i = 0; i < k; i++ )
530 {
531 centers[i].setX( 0.0 );
532 centers[i].setY( 0.0 );
533 }
534 for ( uint i = 0; i < n; i++ )
535 {
536 const int cluster = points[i].cluster;
537 centers[cluster] += QgsVector( points[i].point.x(), points[i].point.y() );
538 weights[cluster] += 1;
539 }
540 for ( int i = 0; i < k; i++ )
541 {
542 centers[i] /= weights[i];
543 }
544}
545
546
@ VectorAnyGeometry
Any vector layer with geometry.
@ Advanced
Parameter is an advanced parameter which should be hidden from users by default.
A vector of attributes.
Wrapper for iterator of features from vector data provider or vector layer.
bool nextFeature(QgsFeature &f)
Fetch next feature and stores in f, returns true on success.
Wraps a request for features to a vector layer (or directly its vector data provider).
@ FastInsert
Use faster inserts, at the cost of updating the passed features to reflect changes made at the provid...
The feature class encapsulates a single feature including its unique ID, geometry and a list of field...
Definition qgsfeature.h:58
QgsAttributes attributes
Definition qgsfeature.h:67
QgsFeatureId id
Definition qgsfeature.h:66
void setAttributes(const QgsAttributes &attrs)
Sets the feature's attributes.
QgsGeometry geometry
Definition qgsfeature.h:69
bool hasGeometry() const
Returns true if the feature has an associated geometry.
bool isCanceled() const
Tells whether the operation has been canceled already.
Definition qgsfeedback.h:53
void setProgress(double progress)
Sets the current progress for the feedback object.
Definition qgsfeedback.h:61
Encapsulate a field in an attribute table or data source.
Definition qgsfield.h:53
Container of fields for a vector layer.
Definition qgsfields.h:46
bool append(const QgsField &field, Qgis::FieldOrigin origin=Qgis::FieldOrigin::Provider, int originIndex=-1)
Appends a field.
Definition qgsfields.cpp:70
A geometry is the spatial representation of a feature.
const QgsAbstractGeometry * constGet() const
Returns a non-modifiable (const) reference to the underlying abstract geometry primitive.
QgsGeometry centroid() const
Returns the center of mass of a geometry.
Qgis::WkbType wkbType() const
Returns type of the geometry as a WKB type (point / linestring / polygon etc.)
Represents a 2D point.
Definition qgspointxy.h:60
Contains information about the context in which a processing algorithm is executed.
Custom exception class for processing related exceptions.
Base class for providing feedback from a processing algorithm.
virtual void pushInfo(const QString &info)
Pushes a general informational message from the algorithm.
virtual void pushWarning(const QString &warning)
Pushes a warning informational message from the algorithm.
virtual void reportError(const QString &error, bool fatalError=false)
Reports that the algorithm encountered an error while executing.
An enum based parameter for processing algorithms, allowing for selection from predefined values.
A feature sink output for processing algorithms.
An input feature source (such as vector layers) parameter for processing algorithms.
A numeric parameter for processing algorithms.
static QgsFields combineFields(const QgsFields &fieldsA, const QgsFields &fieldsB, const QString &fieldsBPrefix=QString())
Combines two field lists, avoiding duplicate field names (in a case-insensitive manner).
Represent a 2-dimensional vector.
Definition qgsvector.h:30
static Qgis::WkbType flatType(Qgis::WkbType type)
Returns the flat type for a WKB type.
bool qgsDoubleNear(double a, double b, double epsilon=4 *std::numeric_limits< double >::epsilon())
Compare two doubles (but allow some difference)
Definition qgis.h:6367