Allow to set relation for cluster size

This commit is contained in:
Elias Projahn 2022-06-22 11:05:51 +02:00
parent fddd0c3fa0
commit 0e4f4621ed
2 changed files with 24 additions and 3 deletions

View file

@ -14,11 +14,18 @@
#' etc.
#' @param n_clusters Maximum number of clusters that should be taken into
#' account. By default, all clusters will be regarded.
#' @param relation Number of items that the cluster size should be based on.
#' This should always be at least the length of the data. By default, the
#' length of the data is used.
#'
#' @return A score between 0.0 and 1.0 summarizing how much the data clusters.
#'
#' @export
clusteriness <- function(data, span = 100000, weight = 0.7, n_clusters = NULL) {
clusteriness <- function(data,
span = 100000,
weight = 0.7,
n_clusters = NULL,
relation = NULL) {
n <- length(data)
# Return a score of 0.0 if there is just one or no value at all.
@ -26,6 +33,10 @@ clusteriness <- function(data, span = 100000, weight = 0.7, n_clusters = NULL) {
return(0.0)
}
if (is.null(relation)) {
relation <- n
}
# Cluster the data and compute the cluster sizes.
tree <- stats::hclust(stats::dist(data))
@ -46,7 +57,7 @@ clusteriness <- function(data, span = 100000, weight = 0.7, n_clusters = NULL) {
cluster_size <- cluster_sizes[i]
if (cluster_size >= 2) {
cluster_score <- cluster_size / n
cluster_score <- cluster_size / relation
score <- score + weight^(i - 1) * cluster_score
}
}

View file

@ -4,7 +4,13 @@
\alias{clusteriness}
\title{Perform a cluster analysis.}
\usage{
clusteriness(data, span = 1e+05, weight = 0.7, n_clusters = NULL)
clusteriness(
data,
span = 1e+05,
weight = 0.7,
n_clusters = NULL,
relation = NULL
)
}
\arguments{
\item{data}{The values that should be scored.}
@ -18,6 +24,10 @@ etc.}
\item{n_clusters}{Maximum number of clusters that should be taken into
account. By default, all clusters will be regarded.}
\item{relation}{Number of items that the cluster size should be based on.
This should always be at least the length of the data. By default, the
length of the data is used.}
}
\value{
A score between 0.0 and 1.0 summarizing how much the data clusters.