# Source code for IoTPy.modules.ML.LinearRegression.linear_regression

import numpy as np
import math
from bokeh.models import ColumnDataSource
from bokeh.plotting import curdoc, figure
from bokeh.client import push_session

# Very small delay value (presumably seconds, judging by the name). It is not
# referenced by any code visible in this file -- TODO confirm where it is used.
TIME_SLEEP = 0.000000001


def train_sgd(X, y, alpha, w=None):
    """Trains a linear regression model using stochastic gradient descent.

    The weights are updated one sample at a time. After every update the mean
    squared error over the whole data set is recomputed; training stops once
    more than 10000 updates have been made and the error differs from the
    previous one by at most 1% of the previous error.

    Parameters
    ----------
    X : numpy.ndarray
        Numpy array of data with dimensions n * m.
    y : numpy.ndarray
        Numpy array of outputs. Dimensions are n * 1, where n is the number of
        rows in `X`.
    alpha : float
        Describes the learning rate. Every update is scaled by ``alpha / n``.
    w : numpy.ndarray, optional
        The initial w vector with dimensions (m + 1) * 1 (the default is
        zero).

    Returns
    -------
    w : numpy.ndarray
        Trained vector with dimensions (m + 1) * 1, where m is the number of
        columns in `X`. The first entry is the weight of the bias term.

    Notes
    -----
    The convergence test is the only way out of the training loop. If the
    error never settles (for example because `alpha` is so large that the
    updates diverge) the function does not return. When `X` has no rows there
    is nothing to train on and the initial `w` is returned unchanged.

    """

    min_updates = 10000       # never stop before this many updates
    relative_tolerance = 0.01  # stop when the error moves by at most 1%

    n_samples = X.shape[0]

    # Add bias term
    X_b = np.hstack((np.ones((n_samples, 1)), X))
    n_weights = X_b.shape[1]

    if w is None:
        w = np.zeros((n_weights, 1))

    # Without samples the loop below would spin forever without ever
    # evaluating the stopping rule.
    if n_samples == 0:
        return w

    # float() keeps an integer alpha from being truncated by integer division.
    step = float(alpha) / n_samples

    previous_error = None
    num_iters = 0

    while True:
        for i in range(n_samples):
            x_i = X_b[i].reshape(n_weights, 1)
            residual = np.dot(np.transpose(w), x_i) - y[i]
            w = w - step * residual * x_i

            # Mean squared error over the full data set, computed on the
            # already augmented matrix so it is not rebuilt on every update.
            error = float(np.sum((y - X_b.dot(w)) ** 2)) / n_samples

            # "<=" (rather than "<") lets an error of exactly zero terminate.
            if (previous_error is not None and num_iters > min_updates and
                    math.fabs(error - previous_error) <=
                    relative_tolerance * previous_error):
                return w

            previous_error = error
            num_iters += 1


def train(X, y):
    """Trains a linear regression model using linear algebra.

    Solves the least-squares problem in closed form by applying the
    Moore-Penrose pseudo-inverse of the bias-augmented data matrix to `y`.

    Parameters
    ----------
    X : numpy.ndarray
        Numpy array of data with dimensions n * m.
    y : numpy.ndarray
        Numpy array of outputs. Dimensions are n * 1, where n is the number of
        rows in `X`.

    Returns
    -------
    w : numpy.ndarray
        Trained vector with dimensions (m + 1) * 1, where m is the number of
        columns in `X`. The first entry is the weight of the bias term.

    """

    # Add bias term
    X_b = np.hstack((np.ones((X.shape[0], 1)), X))

    # Compute pseudo-inverse. np.linalg.pinv is used instead of explicitly
    # inverting X_b^T X_b so that rank-deficient data (e.g. duplicated
    # columns) yields the minimum-norm solution rather than failing.
    X_inverse = np.linalg.pinv(X_b)

    # Compute w
    return X_inverse.dot(y)


# Plot data
def plot(X, y, w, source, step_size, max_window_size):
    """Plot X data, the actual y output, and the prediction line.

    The last `step_size` rows of `X` and `y` are appended to the scatter data
    source in a single ``stream`` call, and the prediction line is replaced by
    the model output for every row of `X`.

    Parameters
    ----------
    X : numpy.ndarray
        Numpy array of data with 1 column.
    y : numpy.ndarray
        Numpy array of outputs. Dimensions are n * 1, where n is the number of
        rows in `X`.
    w : numpy.ndarray
        Numpy array with dimensions 2 * 1.
    source : list
        List of ColumnDataSource: the first receives the streamed data
        points, the second holds the prediction line.
    step_size : int
        The step size, i.e. how many of the most recent rows are streamed.
        If it exceeds the number of rows, all rows are streamed; if it is not
        positive, nothing is streamed.
    max_window_size : int
        The max window size, passed to ``stream`` as the rollover limit.

    """

    # Add bias term
    X_b = np.hstack((np.ones((X.shape[0], 1)), X))
    y_predict = X_b.dot(w)

    points_source, line_source = source

    # Push the newest points in one batch instead of one call per point.
    if step_size > 0:
        new_data = dict(
            x=X[-step_size:, 0].tolist(),
            y=y[-step_size:, 0].tolist(),
        )
        points_source.stream(new_data, max_window_size)

    line_source.data = dict(x=X.flatten().tolist(),
                            y=y_predict.flatten().tolist())


def init_plot(figsize=(1000, 500)):
    """Initializes the plot.

    Creates a figure with a scatter glyph for the data points and a line
    glyph for the prediction, pushes the current document to a Bokeh session
    and opens it.

    Parameters
    ----------
    figsize : tuple, optional
        A tuple containing the width and height of the plot (the default is
        (1000, 500)).

    Returns
    -------
    source : ColumnDataSource
        Initially empty data source backing the scatter of data points.
    predict : ColumnDataSource
        Initially empty data source backing the prediction line.

    """

    source = ColumnDataSource(dict(
        x=[], y=[]
    ))

    predict = ColumnDataSource(dict(
        x=[], y=[]
    ))

    # NOTE(review): newer Bokeh releases name these arguments width/height
    # instead of plot_width/plot_height -- confirm the targeted version.
    p = figure(plot_width=figsize[0], plot_height=figsize[1],
               tools="xpan,xwheel_zoom,xbox_zoom,reset",
               x_axis_type=None, y_axis_location="right")

    # Keep the visible x-range following the most recent data.
    p.x_range.follow = "end"
    p.x_range.follow_interval = 100
    p.x_range.range_padding = 0

    p.circle(x='x', y='y', alpha=0.2, line_width=3, color='navy', source=source)
    p.line(x='x', y='y', alpha=0.2, line_width=3, color='navy', source=predict)

    session = push_session(curdoc())

    curdoc().add_root(p)

    session.show()

    return source, predict

def evaluate_error(X, y, w):
    """Returns the mean squared error.

    Parameters
    ----------
    X : numpy.ndarray
        Numpy array of data.
    y : numpy.ndarray
        Numpy array of outputs. Dimensions are n * 1, where n is the number of
        rows in `X`. An array with the same number of elements but a
        different shape (e.g. a flat vector of length n) is reshaped to match
        the predictions.
    w : numpy.ndarray
        Numpy array with dimensions (m + 1) * 1, where m is the number of
        columns in `X`.

    Returns
    -------
    float
        The mean squared error

    """

    # Add bias term
    X_b = np.hstack((np.ones((X.shape[0], 1)), X))
    y_predict = X_b.dot(w)

    # Align y with the predictions; otherwise an (n,) vector would broadcast
    # against the (n, 1) predictions into an n * n matrix and inflate the
    # error.
    y = np.asarray(y)
    if y.shape != y_predict.shape and y.size == y_predict.size:
        y = y.reshape(y_predict.shape)

    dist = (y - y_predict) ** 2

    # float() before dividing keeps the result a plain Python float.
    return float(np.sum(dist)) / X.shape[0]


def predict(X, w):
    """Returns the predictions for the rows of `X`.

    Parameters
    ----------
    X : numpy.ndarray
        Numpy array of data with dimensions n * m, one data point per row.
    w : numpy.ndarray
        Numpy array with dimensions (m + 1) * 1, where m is the number of
        columns in `X`.

    Returns
    -------
    numpy.ndarray
        The predicted outputs with dimensions n * 1, one per row of `X`.

    """
    # Add bias term
    X_b = np.hstack((np.ones((X.shape[0], 1)), X))
    return X_b.dot(w)