19#include <unordered_map>
24constexpr uint KMEANS_MAX_ITERATIONS = 1000;
26QString QgsKMeansClusteringAlgorithm::name()
const
28 return QStringLiteral(
"kmeansclustering" );
31QString QgsKMeansClusteringAlgorithm::displayName()
const
33 return QObject::tr(
"K-means clustering" );
36QStringList QgsKMeansClusteringAlgorithm::tags()
const
38 return QObject::tr(
"clustering,clusters,kmeans,points" ).split(
',' );
41QString QgsKMeansClusteringAlgorithm::group()
const
43 return QObject::tr(
"Vector analysis" );
46QString QgsKMeansClusteringAlgorithm::groupId()
const
48 return QStringLiteral(
"vectoranalysis" );
51void QgsKMeansClusteringAlgorithm::initAlgorithm(
const QVariantMap & )
56 QStringList initializationMethods;
57 initializationMethods << QObject::tr(
"Farthest points" )
58 << QObject::tr(
"K-means++" );
59 addParameter(
new QgsProcessingParameterEnum( QStringLiteral(
"METHOD" ), QObject::tr(
"Method" ), initializationMethods,
false, 0,
false ) );
61 auto fieldNameParam = std::make_unique<QgsProcessingParameterString>( QStringLiteral(
"FIELD_NAME" ), QObject::tr(
"Cluster field name" ), QStringLiteral(
"CLUSTER_ID" ) );
63 addParameter( fieldNameParam.release() );
64 auto sizeFieldNameParam = std::make_unique<QgsProcessingParameterString>( QStringLiteral(
"SIZE_FIELD_NAME" ), QObject::tr(
"Cluster size field name" ), QStringLiteral(
"CLUSTER_SIZE" ) );
66 addParameter( sizeFieldNameParam.release() );
71QString QgsKMeansClusteringAlgorithm::shortHelpString()
const
73 return QObject::tr(
"This algorithm calculates the 2D distance based k-means cluster number for each input feature.\n\n"
74 "If input geometries are lines or polygons, the clustering is based on the centroid of the feature.\n\n"
76 "Arthur, David & Vassilvitskii, Sergei. (2007). K-Means++: The Advantages of Careful Seeding. Proc. of the Annu. ACM-SIAM Symp. on Discrete Algorithms. 8.\n\n"
77 "Bhattacharya, Anup & Eube, Jan & Röglin, Heiko & Schmidt, Melanie. (2019). Noisy, Greedy and Not So Greedy k-means++" );
80QString QgsKMeansClusteringAlgorithm::shortDescription()
const
82 return QObject::tr(
"Calculates the 2D distance based k-means cluster number for each input feature." );
85QgsKMeansClusteringAlgorithm *QgsKMeansClusteringAlgorithm::createInstance()
const
87 return new QgsKMeansClusteringAlgorithm();
92 std::unique_ptr<QgsProcessingFeatureSource> source( parameterAsSource( parameters, QStringLiteral(
"INPUT" ), context ) );
96 int k = parameterAsInt( parameters, QStringLiteral(
"CLUSTERS" ), context );
97 int initializationMethod = parameterAsInt( parameters, QStringLiteral(
"METHOD" ), context );
99 QgsFields outputFields = source->fields();
101 const QString clusterFieldName = parameterAsString( parameters, QStringLiteral(
"FIELD_NAME" ), context );
102 newFields.
append(
QgsField( clusterFieldName, QMetaType::Type::Int ) );
103 const QString clusterSizeFieldName = parameterAsString( parameters, QStringLiteral(
"SIZE_FIELD_NAME" ), context );
104 newFields.
append(
QgsField( clusterSizeFieldName, QMetaType::Type::Int ) );
108 std::unique_ptr<QgsFeatureSink> sink( parameterAsSink( parameters, QStringLiteral(
"OUTPUT" ), context, dest, outputFields, source->wkbType(), source->sourceCrs() ) );
113 feedback->
pushInfo( QObject::tr(
"Collecting input points" ) );
114 const double step = source->featureCount() > 0 ? 50.0 /
static_cast< double >( source->featureCount() ) : 1;
117 int featureWithGeometryCount = 0;
120 std::vector<Feature> clusterFeatures;
122 QHash<QgsFeatureId, std::size_t> idToObj;
134 featureWithGeometryCount++;
150 idToObj[feat.
id()] = clusterFeatures.size();
151 clusterFeatures.emplace_back( Feature( point ) );
156 feedback->
reportError( QObject::tr(
"Number of geometries is less than the number of clusters requested, not all clusters will get data" ) );
162 feedback->
pushInfo( QObject::tr(
"Calculating clusters" ) );
165 std::vector<QgsPointXY> centers( k );
166 switch ( initializationMethod )
169 initClustersFarthestPoints( clusterFeatures, centers, k, feedback );
172 initClustersPlusPlus( clusterFeatures, centers, k, feedback );
177 calculateKMeans( clusterFeatures, centers, k, feedback );
181 std::unordered_map<int, int> clusterSize;
182 for (
auto it = idToObj.constBegin(); it != idToObj.constEnd(); ++it )
184 clusterSize[clusterFeatures[it.value()].cluster]++;
187 features = source->getFeatures();
199 const auto obj = idToObj.find( feat.
id() );
202 attr << QVariant() << QVariant();
206 attr << 0 << featureWithGeometryCount;
210 const int cluster = clusterFeatures[*obj].cluster;
211 attr << cluster << clusterSize[cluster];
221 outputs.insert( QStringLiteral(
"OUTPUT" ), dest );
227void QgsKMeansClusteringAlgorithm::initClustersFarthestPoints( std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers,
const int k,
QgsProcessingFeedback *feedback )
229 const std::size_t n = points.size();
235 for (
int i = 0; i < k; i++ )
236 centers[i] = points[0].point;
240 std::size_t duplicateCount = 1;
244 double distanceP1 = 0;
245 double distanceP2 = 0;
246 double maxDistance = -1;
247 for ( std::size_t i = 1; i < n; i++ )
249 distanceP1 = points[i].point.sqrDist( points[p1].point );
250 distanceP2 = points[i].point.sqrDist( points[p2].point );
253 if ( ( distanceP1 > maxDistance ) || ( distanceP2 > maxDistance ) )
255 maxDistance = std::max( distanceP1, distanceP2 );
256 if ( distanceP1 > distanceP2 )
267 if ( feedback && duplicateCount > 1 )
269 feedback->
pushWarning( QObject::tr(
"There are at least %n duplicate input(s), the number of output clusters may be less than was requested",
nullptr,
static_cast< int >( duplicateCount ) ) );
276 centers[0] = points[p1].point;
277 centers[1] = points[p2].point;
282 std::vector<double> distances( n );
285 for ( std::size_t j = 0; j < n; j++ )
287 distances[j] = points[j].point.sqrDist( centers[0] );
293 for (
int i = 2; i < k; i++ )
295 std::size_t candidateCenter = 0;
296 double maxDistance = std::numeric_limits<double>::lowest();
299 for ( std::size_t j = 0; j < n; j++ )
302 if ( distances[j] < 0 )
306 distances[j] = std::min( points[j].point.sqrDist( centers[i - 1] ), distances[j] );
309 if ( distances[j] > maxDistance )
312 maxDistance = distances[j];
317 Q_ASSERT( maxDistance >= 0 );
320 distances[candidateCenter] = -1;
322 centers[i] = points[candidateCenter].point;
327void QgsKMeansClusteringAlgorithm::initClustersPlusPlus( std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers,
const int k,
QgsProcessingFeedback *feedback )
329 const std::size_t n = points.size();
335 for (
int i = 0; i < k; i++ )
336 centers[i] = points[0].point;
341 std::random_device rd;
342 std::mt19937 gen( rd() );
343 std::uniform_int_distribution<size_t> distrib( 0, n - 1 );
345 std::size_t p1 = distrib( gen );
346 centers[0] = points[p1].point;
349 std::vector<double> distances( n );
350 double totalError = 0;
351 std::size_t duplicateCount = 1;
352 for (
size_t i = 0; i < n; i++ )
354 double distance = points[i].point.sqrDist( centers[0] );
355 distances[i] = distance;
356 totalError += distance;
362 if ( feedback && duplicateCount > 1 )
364 feedback->
pushWarning( QObject::tr(
"There are at least %n duplicate input(s), the number of output clusters may be less than was requested",
nullptr,
static_cast< int >( duplicateCount ) ) );
372 unsigned int numCandidateCenters = 2 +
static_cast< int >( std::floor( std::log( k ) ) );
373 std::vector<double> randomNumbers( numCandidateCenters );
374 std::vector<size_t> candidateCenters( numCandidateCenters );
376 std::uniform_real_distribution<double> dis( 0.0, 1.0 );
377 for (
int i = 1; i < k; i++ )
380 for (
unsigned int j = 0; j < numCandidateCenters; j++ )
382 randomNumbers[j] = dis( gen ) * totalError;
386 std::vector<double> cumSum = distances;
387 for (
size_t j = 1; j < n; j++ )
389 cumSum[j] += cumSum[j - 1];
393 for (
unsigned int j = 0; j < numCandidateCenters; j++ )
398 while ( low <= high )
400 size_t mid = low + ( high - low ) / 2;
401 if ( cumSum[mid] < randomNumbers[j] )
419 candidateCenters[j] = low;
422 std::vector<std::vector<double>> distancesCandidateCenters( numCandidateCenters, std::vector<double>( n ) );
426 double currentError = 0;
427 double lowestError = std::numeric_limits<double>::max();
428 unsigned int bestCandidateIndex = 0;
429 for (
unsigned int j = 0; j < numCandidateCenters; j++ )
431 for (
size_t z = 0; z < n; z++ )
434 double distance = points[candidateCenters[j]].point.sqrDist( points[z].point );
436 if ( distance > distances[z] )
438 distance = distances[z];
440 distancesCandidateCenters[j][z] = distance;
441 currentError += distance;
443 if ( lowestError > currentError )
445 lowestError = currentError;
446 bestCandidateIndex = j;
451 for (
size_t j = 0; j < n; j++ )
453 distances[j] = distancesCandidateCenters[bestCandidateIndex][j];
456 centers[i] = points[candidateCenters[bestCandidateIndex]].point;
458 totalError = lowestError;
464void QgsKMeansClusteringAlgorithm::calculateKMeans( std::vector<QgsKMeansClusteringAlgorithm::Feature> &objs, std::vector<QgsPointXY> ¢ers,
int k,
QgsProcessingFeedback *feedback )
466 int converged =
false;
467 bool changed =
false;
470 std::vector<uint> weights( k );
473 for ( i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++ )
478 findNearest( objs, centers, k, changed );
479 updateMeans( objs, centers, weights, k );
480 converged = !changed;
483 if ( !converged && feedback )
484 feedback->
reportError( QObject::tr(
"Clustering did not converge after %n iteration(s)",
nullptr,
static_cast<int>( i ) ) );
486 feedback->
pushInfo( QObject::tr(
"Clustering converged after %n iteration(s)",
nullptr,
static_cast<int>( i ) ) );
491void QgsKMeansClusteringAlgorithm::findNearest( std::vector<QgsKMeansClusteringAlgorithm::Feature> &points,
const std::vector<QgsPointXY> ¢ers,
const int k,
bool &changed )
494 const std::size_t n = points.size();
495 for ( std::size_t i = 0; i < n; i++ )
497 Feature &point = points[i];
500 double currentDistance = point.point.sqrDist( centers[0] );
501 int currentCluster = 0;
504 for (
int cluster = 1; cluster < k; cluster++ )
506 const double distance = point.point.sqrDist( centers[cluster] );
507 if ( distance < currentDistance )
509 currentDistance = distance;
510 currentCluster = cluster;
515 if ( point.cluster != currentCluster )
518 point.cluster = currentCluster;
525void QgsKMeansClusteringAlgorithm::updateMeans(
const std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers, std::vector<uint> &weights,
const int k )
527 const uint n = points.size();
528 std::fill( weights.begin(), weights.end(), 0 );
529 for (
int i = 0; i < k; i++ )
531 centers[i].setX( 0.0 );
532 centers[i].setY( 0.0 );
534 for ( uint i = 0; i < n; i++ )
536 const int cluster = points[i].cluster;
537 centers[cluster] +=
QgsVector( points[i].point.x(), points[i].point.y() );
538 weights[cluster] += 1;
540 for (
int i = 0; i < k; i++ )
542 centers[i] /= weights[i];
@ VectorAnyGeometry
Any vector layer with geometry.
@ Advanced
Parameter is an advanced parameter which should be hidden from users by default.
Wrapper for iterator of features from vector data provider or vector layer.
bool nextFeature(QgsFeature &f)
Fetch next feature and stores in f, returns true on success.
Wraps a request for features to a vector layer (or directly its vector data provider).
@ FastInsert
Use faster inserts, at the cost of updating the passed features to reflect changes made at the provid...
The feature class encapsulates a single feature including its unique ID, geometry and a list of field...
void setAttributes(const QgsAttributes &attrs)
Sets the feature's attributes.
bool hasGeometry() const
Returns true if the feature has an associated geometry.
bool isCanceled() const
Tells whether the operation has been canceled already.
void setProgress(double progress)
Sets the current progress for the feedback object.
Encapsulate a field in an attribute table or data source.
Container of fields for a vector layer.
bool append(const QgsField &field, Qgis::FieldOrigin origin=Qgis::FieldOrigin::Provider, int originIndex=-1)
Appends a field.
A geometry is the spatial representation of a feature.
const QgsAbstractGeometry * constGet() const
Returns a non-modifiable (const) reference to the underlying abstract geometry primitive.
QgsGeometry centroid() const
Returns the center of mass of a geometry.
Qgis::WkbType wkbType() const
Returns type of the geometry as a WKB type (point / linestring / polygon etc.)
Contains information about the context in which a processing algorithm is executed.
Custom exception class for processing related exceptions.
Base class for providing feedback from a processing algorithm.
virtual void pushInfo(const QString &info)
Pushes a general informational message from the algorithm.
virtual void pushWarning(const QString &warning)
Pushes a warning informational message from the algorithm.
virtual void reportError(const QString &error, bool fatalError=false)
Reports that the algorithm encountered an error while executing.
An enum based parameter for processing algorithms, allowing for selection from predefined values.
A feature sink output for processing algorithms.
An input feature source (such as vector layers) parameter for processing algorithms.
A numeric parameter for processing algorithms.
static QgsFields combineFields(const QgsFields &fieldsA, const QgsFields &fieldsB, const QString &fieldsBPrefix=QString())
Combines two field lists, avoiding duplicate field names (in a case-insensitive manner).
Represent a 2-dimensional vector.
static Qgis::WkbType flatType(Qgis::WkbType type)
Returns the flat type for a WKB type.
bool qgsDoubleNear(double a, double b, double epsilon=4 *std::numeric_limits< double >::epsilon())
Compare two doubles (but allow some difference)