19#include <unordered_map> 
   24constexpr uint KMEANS_MAX_ITERATIONS = 1000;
 
   26QString QgsKMeansClusteringAlgorithm::name()
 const 
   28  return QStringLiteral( 
"kmeansclustering" );
 
   31QString QgsKMeansClusteringAlgorithm::displayName()
 const 
   33  return QObject::tr( 
"K-means clustering" );
 
   36QStringList QgsKMeansClusteringAlgorithm::tags()
 const 
   38  return QObject::tr( 
"clustering,clusters,kmeans,points" ).split( 
',' );
 
   41QString QgsKMeansClusteringAlgorithm::group()
 const 
   43  return QObject::tr( 
"Vector analysis" );
 
   46QString QgsKMeansClusteringAlgorithm::groupId()
 const 
   48  return QStringLiteral( 
"vectoranalysis" );
 
   51void QgsKMeansClusteringAlgorithm::initAlgorithm( 
const QVariantMap & )
 
   56  QStringList initializationMethods;
 
   57  initializationMethods << QObject::tr( 
"Farthest points" )
 
   58                        << QObject::tr( 
"K-means++" );
 
   59  addParameter( 
new QgsProcessingParameterEnum( QStringLiteral( 
"METHOD" ), QObject::tr( 
"Method" ), initializationMethods, 
false, 0, 
false ) );
 
   61  auto fieldNameParam = std::make_unique<QgsProcessingParameterString>( QStringLiteral( 
"FIELD_NAME" ), QObject::tr( 
"Cluster field name" ), QStringLiteral( 
"CLUSTER_ID" ) );
 
   63  addParameter( fieldNameParam.release() );
 
   64  auto sizeFieldNameParam = std::make_unique<QgsProcessingParameterString>( QStringLiteral( 
"SIZE_FIELD_NAME" ), QObject::tr( 
"Cluster size field name" ), QStringLiteral( 
"CLUSTER_SIZE" ) );
 
   66  addParameter( sizeFieldNameParam.release() );
 
   71QString QgsKMeansClusteringAlgorithm::shortHelpString()
 const 
   73  return QObject::tr( 
"This algorithm calculates the 2D distance based k-means cluster number for each input feature.\n\n" 
   74                      "If input geometries are lines or polygons, the clustering is based on the centroid of the feature.\n\n" 
   76                      "Arthur, David & Vassilvitskii, Sergei. (2007). K-Means++: The Advantages of Careful Seeding. Proc. of the Annu. ACM-SIAM Symp. on Discrete Algorithms. 8.\n\n" 
   77                      "Bhattacharya, Anup & Eube, Jan & Röglin, Heiko & Schmidt, Melanie. (2019). Noisy, Greedy and Not So Greedy k-means++" );
 
   80QString QgsKMeansClusteringAlgorithm::shortDescription()
 const 
   82  return QObject::tr( 
"Calculates the 2D distance based k-means cluster number for each input feature." );
 
   85QgsKMeansClusteringAlgorithm *QgsKMeansClusteringAlgorithm::createInstance()
 const 
   87  return new QgsKMeansClusteringAlgorithm();
 
   92  std::unique_ptr<QgsProcessingFeatureSource> source( parameterAsSource( parameters, QStringLiteral( 
"INPUT" ), context ) );
 
   96  int k = parameterAsInt( parameters, QStringLiteral( 
"CLUSTERS" ), context );
 
   97  int initializationMethod = parameterAsInt( parameters, QStringLiteral( 
"METHOD" ), context );
 
   99  QgsFields outputFields = source->fields();
 
  101  const QString clusterFieldName = parameterAsString( parameters, QStringLiteral( 
"FIELD_NAME" ), context );
 
  102  newFields.
append( 
QgsField( clusterFieldName, QMetaType::Type::Int ) );
 
  103  const QString clusterSizeFieldName = parameterAsString( parameters, QStringLiteral( 
"SIZE_FIELD_NAME" ), context );
 
  104  newFields.
append( 
QgsField( clusterSizeFieldName, QMetaType::Type::Int ) );
 
  108  std::unique_ptr<QgsFeatureSink> sink( parameterAsSink( parameters, QStringLiteral( 
"OUTPUT" ), context, dest, outputFields, source->wkbType(), source->sourceCrs() ) );
 
  113  feedback->
pushInfo( QObject::tr( 
"Collecting input points" ) );
 
  114  const double step = source->featureCount() > 0 ? 50.0 / 
static_cast< double >( source->featureCount() ) : 1;
 
  117  int featureWithGeometryCount = 0;
 
  120  std::vector<Feature> clusterFeatures;
 
  122  QHash<QgsFeatureId, std::size_t> idToObj;
 
  134    featureWithGeometryCount++;
 
  150    idToObj[feat.
id()] = clusterFeatures.size();
 
  151    clusterFeatures.emplace_back( Feature( point ) );
 
  156    feedback->
reportError( QObject::tr( 
"Number of geometries is less than the number of clusters requested, not all clusters will get data" ) );
 
  162    feedback->
pushInfo( QObject::tr( 
"Calculating clusters" ) );
 
  165    std::vector<QgsPointXY> centers( k );
 
  166    switch ( initializationMethod )
 
  169        initClustersFarthestPoints( clusterFeatures, centers, k, feedback );
 
  172        initClustersPlusPlus( clusterFeatures, centers, k, feedback );
 
  177    calculateKMeans( clusterFeatures, centers, k, feedback );
 
  181  std::unordered_map<int, int> clusterSize;
 
  182  for ( 
auto it = idToObj.constBegin(); it != idToObj.constEnd(); ++it )
 
  184    clusterSize[clusterFeatures[it.value()].cluster]++;
 
  187  features = source->getFeatures();
 
  199    const auto obj = idToObj.find( feat.
id() );
 
  202      attr << QVariant() << QVariant();
 
  206      attr << 0 << featureWithGeometryCount;
 
  210      const int cluster = clusterFeatures[*obj].cluster;
 
  211      attr << cluster << clusterSize[cluster];
 
  221  outputs.insert( QStringLiteral( 
"OUTPUT" ), dest );
 
  227void QgsKMeansClusteringAlgorithm::initClustersFarthestPoints( std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers, 
const int k, 
QgsProcessingFeedback *feedback )
 
  229  const std::size_t n = points.size();
 
  235    for ( 
int i = 0; i < k; i++ )
 
  236      centers[i] = points[0].point;
 
  240  std::size_t duplicateCount = 1;
 
  244  double distanceP1 = 0;
 
  245  double distanceP2 = 0;
 
  246  double maxDistance = -1;
 
  247  for ( std::size_t i = 1; i < n; i++ )
 
  249    distanceP1 = points[i].point.sqrDist( points[p1].point );
 
  250    distanceP2 = points[i].point.sqrDist( points[p2].point );
 
  253    if ( ( distanceP1 > maxDistance ) || ( distanceP2 > maxDistance ) )
 
  255      maxDistance = std::max( distanceP1, distanceP2 );
 
  256      if ( distanceP1 > distanceP2 )
 
  267  if ( feedback && duplicateCount > 1 )
 
  269    feedback->
pushWarning( QObject::tr( 
"There are at least %n duplicate input(s), the number of output clusters may be less than was requested", 
nullptr, 
static_cast< int >( duplicateCount ) ) );
 
  276  centers[0] = points[p1].point;
 
  277  centers[1] = points[p2].point;
 
  282    std::vector<double> distances( n );
 
  285    for ( std::size_t j = 0; j < n; j++ )
 
  287      distances[j] = points[j].point.sqrDist( centers[0] );
 
  293    for ( 
int i = 2; i < k; i++ )
 
  295      std::size_t candidateCenter = 0;
 
  296      double maxDistance = std::numeric_limits<double>::lowest();
 
  299      for ( std::size_t j = 0; j < n; j++ )
 
  302        if ( distances[j] < 0 )
 
  306        distances[j] = std::min( points[j].point.sqrDist( centers[i - 1] ), distances[j] );
 
  309        if ( distances[j] > maxDistance )
 
  312          maxDistance = distances[j];
 
  317      Q_ASSERT( maxDistance >= 0 );
 
  320      distances[candidateCenter] = -1;
 
  322      centers[i] = points[candidateCenter].point;
 
  327void QgsKMeansClusteringAlgorithm::initClustersPlusPlus( std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers, 
const int k, 
QgsProcessingFeedback *feedback )
 
  329  const std::size_t n = points.size();
 
  335    for ( 
int i = 0; i < k; i++ )
 
  336      centers[i] = points[0].point;
 
  341  std::random_device rd;
 
  342  std::mt19937 gen( rd() );
 
  343  std::uniform_int_distribution<size_t> distrib( 0, n - 1 );
 
  345  std::size_t p1 = distrib( gen );
 
  346  centers[0] = points[p1].point;
 
  349  std::vector<double> distances( n );
 
  350  double totalError = 0;
 
  351  std::size_t duplicateCount = 1;
 
  352  for ( 
size_t i = 0; i < n; i++ )
 
  354    double distance = points[i].point.sqrDist( centers[0] );
 
  355    distances[i] = distance;
 
  356    totalError += distance;
 
  362  if ( feedback && duplicateCount > 1 )
 
  364    feedback->
pushWarning( QObject::tr( 
"There are at least %n duplicate input(s), the number of output clusters may be less than was requested", 
nullptr, 
static_cast< int >( duplicateCount ) ) );
 
  372  unsigned int numCandidateCenters = 2 + 
static_cast< int >( std::floor( std::log( k ) ) );
 
  373  std::vector<double> randomNumbers( numCandidateCenters );
 
  374  std::vector<size_t> candidateCenters( numCandidateCenters );
 
  376  std::uniform_real_distribution<double> dis( 0.0, 1.0 );
 
  377  for ( 
int i = 1; i < k; i++ )
 
  380    for ( 
unsigned int j = 0; j < numCandidateCenters; j++ )
 
  382      randomNumbers[j] = dis( gen ) * totalError;
 
  386    std::vector<double> cumSum = distances;
 
  387    for ( 
size_t j = 1; j < n; j++ )
 
  389      cumSum[j] += cumSum[j - 1];
 
  393    for ( 
unsigned int j = 0; j < numCandidateCenters; j++ )
 
  398      while ( low <= high )
 
  400        size_t mid = low + ( high - low ) / 2;
 
  401        if ( cumSum[mid] < randomNumbers[j] )
 
  419      candidateCenters[j] = low;
 
  422    std::vector<std::vector<double>> distancesCandidateCenters( numCandidateCenters, std::vector<double>( n ) );
 
  426    double currentError = 0;
 
  427    double lowestError = std::numeric_limits<double>::max();
 
  428    unsigned int bestCandidateIndex = 0;
 
  429    for ( 
unsigned int j = 0; j < numCandidateCenters; j++ )
 
  431      for ( 
size_t z = 0; z < n; z++ )
 
  434        double distance = points[candidateCenters[j]].point.sqrDist( points[z].point );
 
  436        if ( distance > distances[z] )
 
  438          distance = distances[z];
 
  440        distancesCandidateCenters[j][z] = distance;
 
  441        currentError += distance;
 
  443      if ( lowestError > currentError )
 
  445        lowestError = currentError;
 
  446        bestCandidateIndex = j;
 
  451    for ( 
size_t j = 0; j < n; j++ )
 
  453      distances[j] = distancesCandidateCenters[bestCandidateIndex][j];
 
  456    centers[i] = points[candidateCenters[bestCandidateIndex]].point;
 
  458    totalError = lowestError;
 
  464void QgsKMeansClusteringAlgorithm::calculateKMeans( std::vector<QgsKMeansClusteringAlgorithm::Feature> &objs, std::vector<QgsPointXY> ¢ers, 
int k, 
QgsProcessingFeedback *feedback )
 
  466  int converged = 
false;
 
  467  bool changed = 
false;
 
  470  std::vector<uint> weights( k );
 
  473  for ( i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++ )
 
  478    findNearest( objs, centers, k, changed );
 
  479    updateMeans( objs, centers, weights, k );
 
  480    converged = !changed;
 
  483  if ( !converged && feedback )
 
  484    feedback->
reportError( QObject::tr( 
"Clustering did not converge after %n iteration(s)", 
nullptr, 
static_cast<int>( i ) ) );
 
  486    feedback->
pushInfo( QObject::tr( 
"Clustering converged after %n iteration(s)", 
nullptr, 
static_cast<int>( i ) ) );
 
  491void QgsKMeansClusteringAlgorithm::findNearest( std::vector<QgsKMeansClusteringAlgorithm::Feature> &points, 
const std::vector<QgsPointXY> ¢ers, 
const int k, 
bool &changed )
 
  494  const std::size_t n = points.size();
 
  495  for ( std::size_t i = 0; i < n; i++ )
 
  497    Feature &point = points[i];
 
  500    double currentDistance = point.point.sqrDist( centers[0] );
 
  501    int currentCluster = 0;
 
  504    for ( 
int cluster = 1; cluster < k; cluster++ )
 
  506      const double distance = point.point.sqrDist( centers[cluster] );
 
  507      if ( distance < currentDistance )
 
  509        currentDistance = distance;
 
  510        currentCluster = cluster;
 
  515    if ( point.cluster != currentCluster )
 
  518      point.cluster = currentCluster;
 
  525void QgsKMeansClusteringAlgorithm::updateMeans( 
const std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers, std::vector<uint> &weights, 
const int k )
 
  527  const uint n = points.size();
 
  528  std::fill( weights.begin(), weights.end(), 0 );
 
  529  for ( 
int i = 0; i < k; i++ )
 
  531    centers[i].setX( 0.0 );
 
  532    centers[i].setY( 0.0 );
 
  534  for ( uint i = 0; i < n; i++ )
 
  536    const int cluster = points[i].cluster;
 
  537    centers[cluster] += 
QgsVector( points[i].point.x(), points[i].point.y() );
 
  538    weights[cluster] += 1;
 
  540  for ( 
int i = 0; i < k; i++ )
 
  542    centers[i] /= weights[i];
 
@ VectorAnyGeometry
Any vector layer with geometry.
 
@ Advanced
Parameter is an advanced parameter which should be hidden from users by default.
 
Wrapper for iterator of features from vector data provider or vector layer.
 
bool nextFeature(QgsFeature &f)
Fetch next feature and stores in f, returns true on success.
 
Wraps a request for features to a vector layer (or directly its vector data provider).
 
@ FastInsert
Use faster inserts, at the cost of updating the passed features to reflect changes made at the provid...
 
The feature class encapsulates a single feature including its unique ID, geometry and a list of field...
 
void setAttributes(const QgsAttributes &attrs)
Sets the feature's attributes.
 
bool hasGeometry() const
Returns true if the feature has an associated geometry.
 
bool isCanceled() const
Tells whether the operation has been canceled already.
 
void setProgress(double progress)
Sets the current progress for the feedback object.
 
Encapsulate a field in an attribute table or data source.
 
Container of fields for a vector layer.
 
bool append(const QgsField &field, Qgis::FieldOrigin origin=Qgis::FieldOrigin::Provider, int originIndex=-1)
Appends a field.
 
A geometry is the spatial representation of a feature.
 
const QgsAbstractGeometry * constGet() const
Returns a non-modifiable (const) reference to the underlying abstract geometry primitive.
 
QgsGeometry centroid() const
Returns the center of mass of a geometry.
 
Qgis::WkbType wkbType() const
Returns type of the geometry as a WKB type (point / linestring / polygon etc.)
 
Contains information about the context in which a processing algorithm is executed.
 
Custom exception class for processing related exceptions.
 
Base class for providing feedback from a processing algorithm.
 
virtual void pushInfo(const QString &info)
Pushes a general informational message from the algorithm.
 
virtual void pushWarning(const QString &warning)
Pushes a warning informational message from the algorithm.
 
virtual void reportError(const QString &error, bool fatalError=false)
Reports that the algorithm encountered an error while executing.
 
An enum based parameter for processing algorithms, allowing for selection from predefined values.
 
A feature sink output for processing algorithms.
 
An input feature source (such as vector layers) parameter for processing algorithms.
 
A numeric parameter for processing algorithms.
 
static QgsFields combineFields(const QgsFields &fieldsA, const QgsFields &fieldsB, const QString &fieldsBPrefix=QString())
Combines two field lists, avoiding duplicate field names (in a case-insensitive manner).
 
Represent a 2-dimensional vector.
 
static Qgis::WkbType flatType(Qgis::WkbType type)
Returns the flat type for a WKB type.
 
bool qgsDoubleNear(double a, double b, double epsilon=4 *std::numeric_limits< double >::epsilon())
Compare two doubles (but allow some difference)