K-means Clustering

Features of dataset:

eruptions - eruption time in minutes

waiting - waiting time to next eruption in minutes.

Given the eruption data, the goal is to group the eruptions into clusters and then predict which cluster a new eruption belongs to.

Import required libraries


# For mathematical calculation
import numpy as np

# For handling datasets
import pandas as pd

# For plotting graphs
from matplotlib import pyplot as plt

# Import the sklearn library for KMeans Clustering
from sklearn.cluster import KMeans

Import dataset


# Load the eruption data from the CSV file into a DataFrame.
# NOTE: 'data.csv' is resolved relative to the current working directory.
df = pd.read_csv('data.csv')

# Preview the first rows of the dataset. The call form of print is used
# because it is valid on both Python 2 (single argument) and Python 3,
# whereas the statement form `print x` is a SyntaxError on Python 3.
print(df.head())
'''
Output:
   eruptions  waiting
0      3.600       79
1      1.800       54
2      3.333       74
3      2.283       62
4      4.533       85



'''

Train the model


# Number of clusters to partition the data into.
k = 2

# random_state pins the centroid initialisation so the cluster ids (and with
# them the printed predictions and the plot colours) are the same on every
# run; without a seed the ids may swap between runs.
# n_init is passed explicitly because its default differs between
# scikit-learn releases (10 historically, 'auto' from version 1.4).
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)

# Train the model on every column of df; fit() returns the fitted estimator.
kmeans = kmeans.fit(df)

# Array holding the cluster number assigned to each row of df, in row order.
labels = kmeans.labels_

# Array with one row per cluster containing the co-ordinates of its
# centroid, with columns in the same order as the columns of df.
centroids = kmeans.cluster_centers_

Test the model


# Prepare the test data: one [eruptions, waiting] pair per row, in the same
# column order as the data the model was trained on.
x_test = [[4.671, 67], [2.885, 61], [1.666, 90],
          [5.623, 54], [2.678, 80], [1.875, 60]]

# Test the model: predict() returns, for each row, the number of the cluster
# whose centroid is nearest.
prediction = kmeans.predict(x_test)

# The call form of print is valid on both Python 2 (single argument) and
# Python 3, whereas the statement form `print x` fails on Python 3.
# NOTE: cluster numbers are arbitrary ids, so which group is called 0 and
# which is called 1 can differ from the sample output below.
print(prediction)
'''
Output:
[0 0 1 0 1 0]

As value of k is 2 
there are only two clusters 0 and 1.
'''

Plot the clusters.


# Plot every data point, coloured by the cluster it was assigned to.
colors = ['blue', 'red', 'green', 'black']

# Map each row's cluster number to a colour. The modulo cycles through the
# palette so that choosing k larger than len(colors) re-uses colours instead
# of raising IndexError.
point_colors = [colors[label % len(colors)] for label in labels]

# A single scatter call draws all rows at once (first column on the x axis,
# second column on the y axis) instead of creating one artist per row.
plt.scatter(df.iloc[:, 0], df.iloc[:, 1], color=point_colors)

# Plot the centroids as black crosses; ms (marker size) and mew (marker edge
# width) make them larger and bolder than the data points. The 'kx' format
# has a marker but no line style, so no connecting line is drawn.
plt.plot(centroids[:, 0], centroids[:, 1], 'kx', ms=15.0, mew=2.0)

title = 'No of clusters (k) = {}'.format(k)
plt.title(title)
plt.xlabel('eruptions (mins)')
plt.ylabel('waiting (mins)')
plt.show()