Allow to set relation for cluster size

2025-10-25 19:37:23 +02:00 · 2022-06-22 11:05:51 +02:00 · 2022-06-22 11:05:51 +02:00 · 0e4f4621ed
commit 0e4f4621ed
parent fddd0c3fa0
2 changed files with 24 additions and 3 deletions
--- a/R/method_clustering.R
+++ b/R/method_clustering.R
@ -14,11 +14,18 @@
 #'   etc.
 #' @param n_clusters Maximum number of clusters that should be taken into
 #'   account. By default, all clusters will be regarded.
+#' @param relation Number of items that the cluster size should be based on.
+#'   This should always be at least the length of the data. By default, the
+#'   length of the data is used.
 #'
 #' @return A score between 0.0 and 1.0 summarizing how much the data clusters.
 #'
 #' @export
-clusteriness <- function(data, span = 100000, weight = 0.7, n_clusters = NULL) {
+clusteriness <- function(data,
+                         span = 100000,
+                         weight = 0.7,
+                         n_clusters = NULL,
+                         relation = NULL) {
  n <- length(data)

  # Return a score of 0.0 if there is just one or no value at all.
@ -26,6 +33,10 @@ clusteriness <- function(data, span = 100000, weight = 0.7, n_clusters = NULL) {
    return(0.0)
  }

+  if (is.null(relation)) {
+    relation <- n
+  }
+
  # Cluster the data and compute the cluster sizes.

  tree <- stats::hclust(stats::dist(data))
@ -46,7 +57,7 @@ clusteriness <- function(data, span = 100000, weight = 0.7, n_clusters = NULL) {
    cluster_size <- cluster_sizes[i]

    if (cluster_size >= 2) {
-      cluster_score <- cluster_size / n
+      cluster_score <- cluster_size / relation
      score <- score + weight^(i - 1) * cluster_score
    }
  }
--- a/man/clusteriness.Rd
+++ b/man/clusteriness.Rd
@ -4,7 +4,13 @@
 \alias{clusteriness}
 \title{Perform a cluster analysis.}
 \usage{
-clusteriness(data, span = 1e+05, weight = 0.7, n_clusters = NULL)
+clusteriness(
+  data,
+  span = 1e+05,
+  weight = 0.7,
+  n_clusters = NULL,
+  relation = NULL
+)
 }
 \arguments{
 \item{data}{The values that should be scored.}
@ -18,6 +24,10 @@ etc.}

 \item{n_clusters}{Maximum number of clusters that should be taken into
 account. By default, all clusters will be regarded.}
+
+\item{relation}{Number of items that the cluster size should be based on.
+This should always be at least the length of the data. By default, the
+length of the data is used.}
 }
 \value{
 A score between 0.0 and 1.0 summarizing how much the data clusters.