# Simulated data: three groups of four points each, scattered (sd = 0.2)
# around the group centers (1, 1), (2, 2) and (3, 1)
xval <- rnorm(n = 12, mean = rep(seq_len(3), each = 4), sd = 0.2)
yval <- rnorm(n = 12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)

# K-means clustering of 2-D points, started from random cluster centers.
#
# Arguments:
#   x, y   numeric vectors of equal length holding the point coordinates
#   nclus  number of clusters to fit
#
# Returns a data frame with one row per point: its coordinates (xval, yval)
# and the index of the cluster it was assigned to (clus).
kclus <- function(x, y, nclus) {

    stopifnot(
        is.numeric(x), is.numeric(y),
        length(x) > 0, length(x) == length(y),
        length(nclus) == 1, nclus >= 1
    )

    # start with random cluster centers inside the bounding box of the data
    xcen <- runif(n = nclus, min = min(x), max = max(x))
    ycen <- runif(n = nclus, min = min(y), max = max(y))

    # data points and cluster assignment in "data"
    # cluster coordinates in "clus"
    data <- data.frame(xval = x, yval = y, clus = NA)
    clus <- data.frame(name = seq_len(nclus), xcen = xcen, ycen = ycen)

    finish <- FALSE

    while (!finish) {

        # assign cluster with minimum distance to each data point
        for (i in seq_along(x)) {
            dist <- sqrt((x[i] - clus$xcen)^2 + (y[i] - clus$ycen)^2)
            data$clus[i] <- which.min(dist)
        }

        xcen_old <- clus$xcen
        ycen_old <- clus$ycen

        # calculate new cluster centers; a cluster without members gets a NaN
        # center (mean of an empty vector), which which.min() skips afterwards,
        # so that cluster stays empty
        for (i in seq_len(nclus)) {
            members <- data$clus == i
            clus[i, 2] <- mean(data$xval[members])
            clus[i, 3] <- mean(data$yval[members])
        }

        # stop the loop if there is no change in cluster coordinates
        if (identical(xcen_old, clus$xcen) && identical(ycen_old, clus$ycen)) {
            finish <- TRUE
        }
    }

    data
}

# ggplot2 supplies ggplot(), aes() and geom_point() used below
library(ggplot2)

# apply kmeans function to sample data, asking for 4 clusters
cluster <- kclus(xval, yval, 4)

# plot the points, colored by the cluster they were assigned to
ggplot(cluster, aes(xval, yval, color = as.factor(clus))) +
    geom_point()




# Simulated data: three groups of four points each, scattered (sd = 0.2)
# around the group centers (1, 1), (2, 2) and (3, 1)
xval <- rnorm(n = 12, mean = rep(seq_len(3), each = 4), sd = 0.2)
yval <- rnorm(n = 12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)

# K-means clustering of 2-D points with a reproducible random initialization.
#
# Arguments:
#   x, y         numeric vectors of equal length holding the point coordinates
#   nclus        number of clusters to fit
#   random.seed  seed passed to set.seed() before the initial centers are
#                drawn (default 123); NULL leaves the RNG untouched
#
# Returns a data frame with one row per point: its coordinates (xval, yval)
# and the index of the cluster it was assigned to (clus).
#
# Note: set.seed() resets the session-wide random number stream.
kclus <- function(x, y, nclus, random.seed = 123) {

    stopifnot(
        is.numeric(x), is.numeric(y),
        length(x) > 0, length(x) == length(y),
        length(nclus) == 1, nclus >= 1
    )

    # seed the RNG so the same seed always yields the same starting centers
    if (!is.null(random.seed)) {
        set.seed(random.seed)
    }

    # start with random cluster centers inside the bounding box of the data
    xcen <- runif(n = nclus, min = min(x), max = max(x))
    ycen <- runif(n = nclus, min = min(y), max = max(y))

    # data points and cluster assignment in "data"
    # cluster coordinates in "clus"
    data <- data.frame(xval = x, yval = y, clus = NA)
    clus <- data.frame(name = seq_len(nclus), xcen = xcen, ycen = ycen)

    finish <- FALSE

    while (!finish) {

        # assign cluster with minimum distance to each data point
        for (i in seq_along(x)) {
            dist <- sqrt((x[i] - clus$xcen)^2 + (y[i] - clus$ycen)^2)
            data$clus[i] <- which.min(dist)
        }

        xcen_old <- clus$xcen
        ycen_old <- clus$ycen

        # calculate new cluster centers; a cluster without members gets a NaN
        # center (mean of an empty vector), which which.min() skips afterwards,
        # so that cluster stays empty
        for (i in seq_len(nclus)) {
            members <- data$clus == i
            clus[i, 2] <- mean(data$xval[members])
            clus[i, 3] <- mean(data$yval[members])
        }

        # stop the loop if there is no change in cluster coordinates
        if (identical(xcen_old, clus$xcen) && identical(ycen_old, clus$ycen)) {
            finish <- TRUE
        }
    }

    data
}

# ggplot2 supplies ggplot(), aes() and geom_point() used below
library(ggplot2)

# Each run below clusters the sample data into (at most) 4 clusters, computes
# the mean position of the points in every non-empty cluster, and plots the
# points (dots) together with those means (stars, pch = 8).
# The outcomes quoted in the comments are the ones reported for the original
# author's data; xval/yval are random, so a fresh sample may behave differently.

# default random seed 123
# reported outcome: no data points were assigned to the 4th cluster
cluster <- kclus(xval, yval, 4)
cluster.centers <- aggregate(. ~ clus, cluster, mean)
ggplot(cluster, aes(xval, yval, color = as.factor(clus))) +
    geom_point(size = 5) +
    geom_point(data = cluster.centers, aes(xval, yval, col = as.factor(clus)), pch = 8, size = 5)

# random seed 12
# reported outcome: 4 clusters, with the 2nd cluster having a single data point
cluster <- kclus(xval, yval, 4, 12)
cluster.centers <- aggregate(. ~ clus, cluster, mean)
ggplot(cluster, aes(xval, yval, color = as.factor(clus))) +
    geom_point(size = 5) +
    geom_point(data = cluster.centers, aes(xval, yval, col = as.factor(clus)), pch = 8, size = 5)

# random seed 12345
# reported outcome: 2 clusters, with all data points assigned to the 1st and
# the 2nd cluster
cluster <- kclus(xval, yval, 4, 12345)
cluster.centers <- aggregate(. ~ clus, cluster, mean)
ggplot(cluster, aes(xval, yval, color = as.factor(clus))) +
    geom_point(size = 5) +
    geom_point(data = cluster.centers, aes(xval, yval, col = as.factor(clus)), pch = 8, size = 5)

  1. まず、ランダムに初期化した中心でアルゴリズムを複数回実行し、クラスター品質(SSE などで測定)が最も良い結果を選ぶことができます。
  2. もう 1 つ試せるのは、K-means++ によるスマートな初期化です。
  3. あまり良くない選択肢ですが、クラスターの再割り当て時に k (= 4) 個の各クラスターに少なくとも 1 点が割り当てられていることを保証する(そうでない場合は再割り当てしない)ようにアルゴリズムを変更することもできます。
  4. 最後に、階層型クラスタリングのような他のアルゴリズムを試すこともできます。デンドログラム(樹形図)を使うことで柔軟性が増し、必要なクラスター数を選択できます。






