Allow setting the relation for cluster size

2025-10-26 10:47:25 +01:00 · 2022-06-22 11:05:51 +02:00 · 2022-06-22 11:05:51 +02:00 · 0e4f4621ed
commit 0e4f4621ed
parent fddd0c3fa0
2 changed files with 24 additions and 3 deletions
--- a/R/method_clustering.R
+++ b/R/method_clustering.R
@ -14,11 +14,18 @@
 #'   etc.
 #' @param n_clusters Maximum number of clusters that should be taken into
 #'   account. By default, all clusters will be regarded.
 #' @param relation Number of items that the cluster size should be based on.
 #'   This should always be at least the length of the data. By default, the
 #'   length of the data is used.
 #'
 #' @return A score between 0.0 and 1.0 summarizing how much the data clusters.
 #'
 #' @export
-clusteriness <- function(data, span = 100000, weight = 0.7, n_clusters = NULL) {
+clusteriness <- function(data,
                         span = 100000,
                         weight = 0.7,
                         n_clusters = NULL,
                         relation = NULL) {
  n <- length(data)
  # Return a score of 0.0 if there is just one or no value at all.
@ -26,6 +33,10 @@ clusteriness <- function(data, span = 100000, weight = 0.7, n_clusters = NULL) {
    return(0.0)
  }
  if (is.null(relation)) {
    relation <- n
  }
  # Cluster the data and compute the cluster sizes.
  tree <- stats::hclust(stats::dist(data))
@ -46,7 +57,7 @@ clusteriness <- function(data, span = 100000, weight = 0.7, n_clusters = NULL) {
    cluster_size <- cluster_sizes[i]
    if (cluster_size >= 2) {
-      cluster_score <- cluster_size / n
+      cluster_score <- cluster_size / relation
      score <- score + weight^(i - 1) * cluster_score
    }
  }
--- a/man/clusteriness.Rd
+++ b/man/clusteriness.Rd
@ -4,7 +4,13 @@
 \alias{clusteriness}
 \title{Perform a cluster analysis.}
 \usage{
-clusteriness(data, span = 1e+05, weight = 0.7, n_clusters = NULL)
+clusteriness(
  data,
  span = 1e+05,
  weight = 0.7,
  n_clusters = NULL,
  relation = NULL
 )
 }
 \arguments{
 \item{data}{The values that should be scored.}
@ -18,6 +24,10 @@ etc.}
 \item{n_clusters}{Maximum number of clusters that should be taken into
 account. By default, all clusters will be regarded.}
 \item{relation}{Number of items that the cluster size should be based on.
 This should always be at least the length of the data. By default, the
 length of the data is used.}
 }
 \value{
 A score between 0.0 and 1.0 summarizing how much the data clusters.