diff --git a/R/method_clustering.R b/R/method_clustering.R index bb4a75c..18016d4 100644 --- a/R/method_clustering.R +++ b/R/method_clustering.R @@ -14,11 +14,18 @@ #' etc. #' @param n_clusters Maximum number of clusters that should be taken into #' account. By default, all clusters will be regarded. +#' @param relation Number of items that the cluster size should be based on. +#' This should always be at least the length of the data. By default, the length +#' of the data is used. #' #' @return A score between 0.0 and 1.0 summarizing how much the data clusters. #' #' @export -clusteriness <- function(data, span = 100000, weight = 0.7, n_clusters = NULL) { +clusteriness <- function(data, + span = 100000, + weight = 0.7, + n_clusters = NULL, + relation = NULL) { n <- length(data) # Return a score of 0.0 if there is just one or no value at all. @@ -26,6 +33,10 @@ clusteriness <- function(data, span = 100000, weight = 0.7, n_clusters = NULL) { return(0.0) } + if (is.null(relation)) { + relation <- n + } + # Cluster the data and compute the cluster sizes. tree <- stats::hclust(stats::dist(data)) @@ -46,7 +57,7 @@ clusteriness <- function(data, span = 100000, weight = 0.7, n_clusters = NULL) { cluster_size <- cluster_sizes[i] if (cluster_size >= 2) { - cluster_score <- cluster_size / n + cluster_score <- cluster_size / relation score <- score + weight^(i - 1) * cluster_score } } diff --git a/man/clusteriness.Rd b/man/clusteriness.Rd index 2af8df4..2b9f952 100644 --- a/man/clusteriness.Rd +++ b/man/clusteriness.Rd @@ -4,7 +4,13 @@ \alias{clusteriness} \title{Perform a cluster analysis.} \usage{ -clusteriness(data, span = 1e+05, weight = 0.7, n_clusters = NULL) +clusteriness( + data, + span = 1e+05, + weight = 0.7, + n_clusters = NULL, + relation = NULL +) } \arguments{ \item{data}{The values that should be scored.} @@ -18,6 +24,10 @@ etc.} \item{n_clusters}{Maximum number of clusters that should be taken into account. 
By default, all clusters will be regarded.} + +\item{relation}{Number of items that the cluster size should be based on. +This should always be at least the length of the data. By default, the length +of the data is used.} } \value{ A score between 0.0 and 1.0 summarizing how much the data clusters.