-
Notifications
You must be signed in to change notification settings - Fork 0
/
support_vector_machine.py
112 lines (98 loc) · 3.59 KB
/
support_vector_machine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import numpy as np
import math
import cvxopt
# Silence cvxopt's per-iteration progress printout for every qp() call below.
cvxopt.solvers.options['show_progress'] = False
class SVM:
    '''
    Hard-margin Support Vector Machine for binary classification of
    linearly separable datasets.

    Fields:
        int     dim          Dimensionality of the input vectors
        ndarray weights      Array (dim+1,) of weights [b, w1, ..., wdim]
        list    data         List of tuples (x, y): x a length-dim vector, y = f(x) in {-1, +1}
        list    suppvectors  Support vectors found by solve()
    '''
    def __init__(self, dim, data=None):
        '''
        dim:  dimensionality of the input space
        data: optional list of (x, y) training tuples
        '''
        # NOTE: the original signature used a mutable default (data=[]),
        # which is shared across all instances; None sentinel avoids that.
        self.dim = dim
        self.reset([] if data is None else data)

    def reset(self, data):
        '''
        Reset the weights to zero and install a validated data sample.

        Raises:
            ValueError: if any x in data does not have length self.dim.
        '''
        self.weights = np.zeros(self.dim + 1)
        for t in data:
            if len(t[0]) != self.dim:
                raise ValueError('Wrong data dimensionality')
        # Defensive copy: callers mutating their list must not alias ours.
        self.data = list(data)
        self.suppvectors = []

    def solve(self):
        '''
        Compute the weights and support vectors by solving the hard-margin
        dual QP with cvxopt:

            minimize (1/2) a^T P a - 1^T a
            s.t.     a_i >= 0  and  y^T a = 0

        with P[i,j] = y_i y_j <x_i, x_j>.  The equality constraint is
        encoded as the inequality pair y^T a <= 0 and -y^T a <= 0.

        Raises:
            ValueError: if no data has been supplied.
        '''
        n = len(self.data)
        if n == 0:
            raise ValueError('Cannot solve: no training data supplied')
        X = np.asarray([point[0] for point in self.data], dtype=float)  # (n, dim)
        y = np.asarray([point[1] for point in self.data], dtype=float)  # (n,)
        # Label-scaled Gram matrix, vectorized (was an O(n^2) Python loop).
        P = cvxopt.matrix(np.outer(y, y) * (X @ X.T))
        q = cvxopt.matrix(np.full(n, -1.0))
        # Rows: y^T a <= 0, -y^T a <= 0, then -a_i <= 0 for each i.
        G = cvxopt.matrix(np.vstack((y, -y, -np.identity(n))))
        h = cvxopt.matrix(np.zeros(n + 2))
        sol = cvxopt.solvers.qp(P, q, G, h)
        alpha = sol['x']
        # Numerical cutoff: multipliers below 1/n^2 are treated as zero;
        # the rest mark support vectors.
        supp = []
        for i in range(n):
            if alpha[i] < 1 / (n ** 2):
                alpha[i] = 0
            else:
                supp.append(self.data[i])
        self.suppvectors = supp
        # w = sum_i alpha_i y_i x_i
        w = np.zeros(self.dim)
        for i in range(n):
            w += alpha[i] * y[i] * X[i]
        # Bias from one support vector (any works in exact arithmetic; keep
        # the original's choice of the largest-norm SV, first on ties).
        maxsv = self.data[0]
        if self.suppvectors:
            maxsv = max(self.suppvectors, key=lambda sv: np.linalg.norm(sv[0]))
        b = maxsv[1] - np.dot(w, maxsv[0])
        self.weights = np.append([b], w, axis=0)

    def hypothesis(self, x):
        '''
        Take a dim-dimensional data vector x and compute h(x) = sign(w . [1, x])
        using the current weights; returns +1.0, -1.0, or 0.0 exactly on the
        separating hyperplane.

        Generalized to accept any sequence (list, tuple, ndarray); the
        original list-concatenation only accepted Python lists.
        '''
        # Prepend the constant 1 so weights[0] acts as the bias term b.
        x_adj = np.concatenate(([1.0], np.asarray(x, dtype=float)))
        weighted_sum = np.dot(self.weights, x_adj)
        if weighted_sum == 0.0:
            return 0.0
        return math.copysign(1.0, weighted_sum)  # sign function

    def classify(self, point):
        '''
        Take as "point" a tuple (x, y) with x a vector and y = f(x), and
        classify it: return True if h(x) == f(x), False otherwise.
        '''
        return self.hypothesis(point[0]) == point[1]

    def classification_error(self, data):
        '''
        Return the fraction of points in data misclassified by the current
        hypothesis: Ein when given self.data, Eout when given a fresh
        dataset generated by f(x).  data must be non-empty.
        '''
        g_misclass_points = sum(1 for point in data if not self.classify(point))
        return g_misclass_points / len(data)