# Source code for IoTPy.modules.ML.KMeans.kmeans

import numpy as np
from bokeh.models import ColumnDataSource, LinearColorMapper
from bokeh.plotting import curdoc, figure
from bokeh.client import push_session
from bokeh.palettes import Magma
import random

# Very small delay constant.
# NOTE(review): not referenced by any function visible in this file;
# presumably a sleep interval in seconds between plot updates -- confirm.
TIME_SLEEP = 0.000000001

def initialize(k, low, high):
    """Return `k` random 2-D points with coordinates in [low, high).

    Parameters
    ----------
    k : int
        The number of points to return.
    low : int
        The lower bound (inclusive) for each coordinate.
    high : int
        The upper bound (exclusive) for each coordinate.

    Returns
    -------
    centroids : numpy.ndarray
        Numpy array with dimensions `k` by 2.

    """
    # rand() draws uniformly from [0, 1); scale and shift into [low, high).
    centroids = np.random.rand(k, 2) * (high - low) + low
    return centroids
def initializeCentroids(X, k):
    """Return `k` distinct rows of `X`, chosen at random without replacement.

    Parameters
    ----------
    X : numpy.ndarray
        A numpy array with dimensions n * 2, where n >= `k`.
    k : int
        The number of points to return.

    Returns
    -------
    numpy.ndarray
        Numpy array with `k` rows taken from `X`.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` if `k` is larger than the number of
        rows in `X` (or negative).

    """
    # range() rather than the Python-2-only xrange(): random.sample accepts
    # either, and range() exists on both Python 2 and Python 3.
    chosen_rows = random.sample(range(len(X)), k)
    return X[chosen_rows, :]
def findClosestCentroids(X, centroids):
    """Return the index of the closest centroid for each point in `X`.

    Closeness is measured by squared Euclidean distance; when two centroids
    are equally close, the one with the lower row index wins.

    Parameters
    ----------
    X : numpy.ndarray
        A numpy array with one point per row.
    centroids : numpy.ndarray
        A numpy array with one centroid per row and the same number of
        columns as `X`.

    Returns
    -------
    index : numpy.ndarray
        A 1-D integer array of length n, where n is the number of rows in
        `X`. Each index[i] is in [0, k) where k is the number of rows in
        `centroids`.

    """
    X = np.asarray(X)
    centroids = np.asarray(centroids)

    # No points: nothing to assign (and nothing to broadcast against).
    if X.shape[0] == 0:
        return np.empty(0, dtype=np.intp)

    # Broadcast to an (n, k, d) array of point-minus-centroid differences,
    # then reduce over the coordinate axis to get an (n, k) distance table.
    diffs = X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    squared_dist = np.sum(diffs * diffs, axis=2)

    # argmin returns the first minimum, i.e. the lowest centroid index on ties.
    return np.argmin(squared_dist, axis=1)
def computeCentroids(X, index, k):
    """Compute each centroid as the mean of the points assigned to it.

    Parameters
    ----------
    X : numpy.ndarray
        A numpy array with dimensions n * d (one point per row).
    index : numpy.ndarray
        A numpy array with n entries (shape ``(n,)`` or ``(n, 1)``) giving
        the closest centroid for each point in `X`.
    k : int
        The number of centroids. k - 1 is the maximum value that appears in
        `index`.

    Returns
    -------
    centroids : numpy.ndarray
        A numpy array with dimensions `k` * d.

    Notes
    -----
    For i in [0, k), centroids[i] is the mean of all data points X[j] where
    index[j] is i. A centroid with no assigned points is left at the
    origin (all zeros).

    """
    X = np.asarray(X)
    # Flatten so that both (n,) and (n, 1) label arrays yield a row mask.
    labels = np.ravel(index)

    # Size the result from the data instead of hard-coding two columns.
    centroids = np.zeros((k, X.shape[1]))
    for i in range(k):
        members = X[labels == i]
        # Skip empty clusters: the mean of zero rows would be NaN.
        if len(members) != 0:
            centroids[i, :] = np.mean(members, axis=0)
    return centroids
def kmeans(X, k, initial_centroids=None, draw=False, output=False,
           source=None):
    """Run k-means until the cluster assignments stop changing.

    Parameters
    ----------
    X : numpy.ndarray
        A numpy array with one data point per row.
    k : int
        The number of centroids.
    initial_centroids : numpy.ndarray, optional
        A numpy array with `k` rows holding the starting centroids. If not
        provided, the algorithm starts from `k` random rows of `X`.
    draw : boolean, optional
        Whether each iteration is plotted through `plotKMeans` (the default
        is False). Plotting uses the first two columns of the data.
    output : boolean, optional
        Whether debug info is printed (the default is False). Info includes
        the number of reassigned points per iteration and the final number
        of iterations.
    source : sequence, optional
        The three plot data sources passed on to `plotKMeans`. Must be
        supplied when `draw` is True; it is not used otherwise.

    Returns
    -------
    list
        ``[centroids, index, num_iters]`` where `centroids` is the array of
        learned centroids, `index` holds the closest centroid for each row
        of `X` (as returned by `findClosestCentroids`), and `num_iters` is
        the number of centroid updates performed.

    """
    num_iters = 0

    # Use the caller's centroids if given, otherwise sample them from X.
    if initial_centroids is not None:
        centroids = initial_centroids
    else:
        centroids = initializeCentroids(X, k)

    previous = centroids
    # None means "no assignment yet": the first pass must always update the
    # centroids, even if every point happens to land in cluster 0.
    previous_index = None

    while True:
        index = findClosestCentroids(X, centroids)

        # If no points have been reassigned, the centroids will not move and
        # we are done.
        if previous_index is not None and np.array_equal(index,
                                                         previous_index):
            break

        # Report how many points switched cluster. Formatting with % keeps
        # the text identical on Python 2 and Python 3.
        if num_iters != 0 and output:
            print("%s  data points changed color"
                  % np.count_nonzero(index - previous_index))

        previous_index = index

        if draw:
            plotKMeans(X, centroids, previous, index, source)

        previous = centroids
        centroids = computeCentroids(X, index, k)
        num_iters += 1

    if output:
        print("Num iters:  %s" % num_iters)

    return [centroids, index, num_iters]
def initializeDataCenter(centroid, scale, n):
    """Draw `n` normally distributed 2-D points around a centroid.

    Parameters
    ----------
    centroid : numpy.ndarray
        Numpy array with the two coordinates of the cluster center; used as
        the mean of the distribution.
    scale : int
        The scale (standard deviation) of the distribution.
    n : int
        The number of points to make.

    Returns
    -------
    X : numpy.ndarray
        A numpy array with dimensions `n` * 2.

    """
    # The 2-element mean broadcasts across the n rows of the sample.
    X = np.random.normal(centroid, scale=scale, size=(n, 2))
    return X
def initializeData(n, k, scale, low, high):
    """Generate `n` normally distributed points around each of `k` centroids.

    The centroids themselves are drawn at random by `initialize`.

    Parameters
    ----------
    n : int
        The number of points to make around each centroid.
    k : int
        The number of centroids.
    scale : int
        The scale for the distribution around each centroid.
    low : int
        The lower bound (inclusive) for a centroid coordinate.
    high : int
        The upper bound (exclusive) for a centroid coordinate.

    Returns
    -------
    X : numpy.ndarray
        A numpy array with dimensions (`n` * `k`) * 2.

    """
    centroids = initialize(k, low, high)
    clusters = [initializeDataCenter(center, scale, n)
                for center in centroids]

    # With k == 0 there is nothing to stack; return an empty point set
    # instead of failing on an unbound result.
    if not clusters:
        return np.empty((0, 2))

    # One vstack over all clusters avoids re-copying the growing array.
    return np.vstack(clusters)
def _plotData(X, index, source):
    """Publish the data points and their cluster labels to a data source.

    Parameters
    ----------
    X : numpy.ndarray
        A numpy array whose first two columns are the x and y coordinates.
    index : numpy.ndarray
        The cluster label of each row of `X`.
    source : object
        The data source to update (a bokeh ColumnDataSource in this
        module); its `data` attribute is replaced.

    """
    # Replace the whole dict in one assignment so all columns change together.
    source.data = dict(x=X[:, 0], y=X[:, 1], index=index)
def plotKMeans(X, centroids, previous, index, source):
    """Plot the data and centroids.

    Updates the plot data sources with the labelled data, the current
    centroids, and one segment per centroid showing its movement from the
    previous position to the current one.

    Parameters
    ----------
    X : numpy.ndarray
        A numpy array with 2 columns.
    centroids : numpy.ndarray
        A numpy array with 2 columns.
    previous : numpy.ndarray
        A numpy array with 2 columns and the same number of rows as
        `centroids`.
    index : numpy.ndarray
        The cluster label of each row of `X`.
    source : list
        Three data sources, in the order returned by `init_plot`: data
        points, centroids, movement segments.

    """
    # Unpack into distinct names rather than shadowing the `source` argument.
    data_source, centroid_source, segment_source = source

    _plotData(X, index, data_source)
    centroid_source.data = dict(x=centroids[:, 0], y=centroids[:, 1])
    # Each segment runs from a centroid's previous position to its new one.
    segment_source.data = dict(x0=previous[:, 0], y0=previous[:, 1],
                               x1=centroids[:, 0], y1=centroids[:, 1])
def init_plot(figsize=(1000, 500)):
    """Initialize the plot and register it with a bokeh session.

    Parameters
    ----------
    figsize : tuple, optional
        A tuple containing the width and height of the plot (the default is
        (1000, 500)).

    Returns
    -------
    source : bokeh.models.ColumnDataSource
        Data source for the data points (columns x, y, index).
    centroids : bokeh.models.ColumnDataSource
        Data source for the centroids (columns x, y).
    segments : bokeh.models.ColumnDataSource
        Data source for the centroid movement segments (columns x0, y0,
        x1, y1).

    """
    source = ColumnDataSource(dict(x=[], y=[], index=[]))
    centroids = ColumnDataSource(dict(x=[], y=[]))
    segments = ColumnDataSource(dict(x0=[], y0=[], x1=[], y1=[]))

    p = figure(plot_width=figsize[0], plot_height=figsize[1],
               tools="xpan,xwheel_zoom,xbox_zoom,reset",
               x_axis_type=None, y_axis_location="right")

    # Colour each data point by its cluster label.
    mapper = LinearColorMapper(palette=Magma[256])

    # NOTE(review): the glyph method name of the next two calls was lost in
    # the extracted source; `p.circle` is reconstructed from the keyword
    # arguments (x, y, size) -- confirm against the upstream file.
    p.circle(x='x', y='y', alpha=0.2, line_width=3,
             color={"field": "index", "transform": mapper}, source=source)
    p.circle(x='x', y='y', alpha=0.2, size=20, color='black',
             source=centroids)
    p.segment(x0='x0', y0='y0', x1='x1', y1='y1', line_width=3,
              color='blue', source=segments)

    # Open the session before adding the root, as in the original; the
    # session object itself is not used afterwards.
    push_session(curdoc())
    curdoc().add_root(p)
    return source, centroids, segments
def evaluate_error(X, centroids, index):
    """Return the mean squared error of a clustering.

    Parameters
    ----------
    X : numpy.ndarray
        A numpy array with one data point per row.
    centroids : numpy.ndarray
        A numpy array with one centroid per row and the same number of
        columns as `X`.
    index : numpy.ndarray
        Integer array with one entry per row of `X` giving the row of
        `centroids` assigned to that point.

    Returns
    -------
    float
        The mean squared error.

    Raises
    ------
    ZeroDivisionError
        If `X` has no rows.

    Notes
    -----
    The mean squared error is calculated as the average squared Euclidean
    distance of each point from its assigned centroid.

    """
    # Fancy-index the centroids so row i holds the centroid assigned to X[i];
    # ravel() lets an (n, 1) label array work as well as an (n,) one.
    assigned = centroids[np.ravel(index)]
    diffs = X - assigned
    total = np.sum(diffs * diffs)
    return float(total) / X.shape[0]