Skip to content

Latest commit

 

History

History
395 lines (262 loc) · 8.42 KB

anomaly_detection_with_autoencoder.md

File metadata and controls

395 lines (262 loc) · 8.42 KB

Note: This is a generated markdown export from the Jupyter notebook file anomaly_detection_with_autoencoder.ipynb. You can also view the notebook with the nbviewer from Jupyter.

# Anomaly detection with an Autoencoder

%matplotlib inline
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets, metrics, model_selection
# Load the handwritten-digit dataset bundled with scikit-learn.
digits = datasets.load_digits()

# Preview the leading samples: zip stops after the ten axes created here,
# so exactly ten digits are drawn, each titled with its label.
fig, axes = plt.subplots(nrows=1, ncols=10, figsize=(10, 3))
for axis, (img, digit) in zip(axes, zip(digits.images, digits.target)):
    axis.set_axis_off()
    axis.imshow(img, cmap=plt.cm.gray_r)
    axis.set_title('%i' % digit)

png

# Keep images and labels under short names for the rest of the notebook.
data, target = digits.images, digits.target

# Report the pixel value range and the array shape; the range determines the
# scaling factor applied before training.
pixel_min = np.amin(data)
pixel_max = np.amax(data)
data_shape = np.shape(data)
print("min value: {}".format(pixel_min))
print("max value: {}".format(pixel_max))
print("shape: {}".format(data_shape))
min value: 0.0
max value: 16.0
shape: (1797, 8, 8)
# Hold out half of the samples as a test set.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    data, target, test_size=0.5)

# Pixel values span 0..16; rescale both splits to [0, 1] as float32 so they
# share the range of the decoder's sigmoid output.
X_train, X_test = (
    split.astype('float32') / 16. for split in (X_train, X_test)
)


# Tag every label with the split it belongs to so the class distribution of
# both splits can be compared in one plot.
df_train = pd.DataFrame(y_train, columns=['target'])
df_train['type'] = 'train'

df_test = pd.DataFrame(y_test, columns=['target'])
df_test['type'] = 'test'

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported replacement. ignore_index avoids duplicate row labels in
# the combined frame.
df_set = pd.concat([df_train, df_test], ignore_index=True)

_ = sns.countplot(x='target', hue='type', data=df_set)

print('train samples:', len(X_train))
print('test samples', len(X_test))
train samples: 898
test samples 899

png

class Autoencoder(tf.keras.models.Model):
    """Fully connected autoencoder for 8x8 images.

    The encoder flattens the input and narrows it through dense layers of
    64, 16 and 8 units; the decoder widens it again through 16 and 64 units
    and reshapes the result to (8, 8). The final dense layer uses a sigmoid
    activation, all other dense layers use ReLU.
    """

    def __init__(self):
        super().__init__()
        layers = tf.keras.layers

        # Compress: flatten -> 64 -> 16 -> 8.
        encoder_layers = [
            layers.Flatten(),
            layers.Dense(64, activation='relu'),
            layers.Dense(16, activation='relu'),
            layers.Dense(8, activation='relu'),
        ]
        self.encoder = tf.keras.Sequential(encoder_layers)

        # Expand: 16 -> 64 -> reshape to an 8x8 image.
        decoder_layers = [
            layers.Dense(16, activation='relu'),
            layers.Dense(64, activation='sigmoid'),
            layers.Reshape((8, 8)),
        ]
        self.decoder = tf.keras.Sequential(decoder_layers)

    def call(self, x):
        """Encode ``x`` and return its reconstruction from the decoder."""
        return self.decoder(self.encoder(x))

# Build the model and configure training: Adam optimizer, mean squared error
# between the input and its reconstruction as the loss.
autoencoder = Autoencoder()
autoencoder.compile(
    loss='mse',
    optimizer='adam',
)

%%time
# Train the autoencoder to reproduce its own input (input == target).
# Only validation_data is passed: Keras ignores validation_split whenever
# validation_data is supplied, so passing both was misleading.
# NOTE(review): the test split doubles as the validation set here, so
# val_loss is not an independent hold-out estimate.
history = autoencoder.fit(X_train, X_train,
            epochs=100,
            validation_data=(X_test, X_test),
            verbose=0)
CPU times: user 7.87 s, sys: 1.23 s, total: 9.09 s
Wall time: 6.92 s
# Tabulate the per-epoch metrics recorded by fit(), add the epoch number as a
# column, and show the last rows.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()
loss val_loss epoch
95 0.023893 0.023797 95
96 0.023811 0.023788 96
97 0.023782 0.023722 97
98 0.023789 0.023617 98
99 0.023696 0.023929 99
def plot_loss(history):
    """Plot the training and validation loss curves of a fit() history.

    Reads the 'loss' and 'val_loss' series from ``history.history`` and draws
    both on the current matplotlib figure.
    """
    for series in ('loss', 'val_loss'):
        plt.plot(history.history[series], label=series)

    plt.xlabel('Epoch')
    plt.ylabel('Error')
    plt.legend()
    plt.grid(True)


plot_loss(history)

png

# The model was trained on pixels rescaled to [0, 1] (raw values / 16), so the
# raw 0..16 images must be scaled the same way before prediction; feeding the
# unscaled images gives the network inputs outside its training range.
reconstructions = autoencoder.predict(digits.images.astype('float32') / 16.)

# Show the reconstructions of the first ten digits, titled with their labels.
fig, axes = plt.subplots(nrows=1, ncols=10, figsize=(10, 3))
for ax, image, label in zip(axes, reconstructions, digits.target):
    ax.set_axis_off()
    ax.imshow(image, cmap=plt.cm.gray_r)
    ax.set_title('%i' % label)

png

# Per-sample reconstruction error: mae averages the absolute difference over
# the last axis of each image, and np.mean then averages over the remaining
# image axis, leaving one value per sample.
reconstruction_error_train = np.mean(
    tf.keras.losses.mae(autoencoder.predict(X_train), X_train), axis=-1)
reconstruction_error_test = np.mean(
    tf.keras.losses.mae(autoencoder.predict(X_test), X_test), axis=-1)

df_train = pd.DataFrame(reconstruction_error_train, columns=['reconstruction_error'])
df_train['type'] = 'train'

df_test = pd.DataFrame(reconstruction_error_test, columns=['reconstruction_error'])
df_test['type'] = 'test'

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. ignore_index avoids duplicate row labels in the combined frame.
df_set = pd.concat([df_train, df_test], ignore_index=True)


fig, axs = plt.subplots(nrows=2, figsize=(10, 5))
fig.suptitle('Reconstruction error', fontsize=16)

# Samples whose error lies above this percentile are treated as anomalies.
# NOTE(review): the threshold is derived from the test errors themselves, so
# about (100 - p_threshold)% of the test set is flagged by construction;
# consider deriving it from reconstruction_error_train instead.
p_threshold = 99
threshold = np.percentile(reconstruction_error_test, p_threshold)

# Leave one standard deviation of head-room to the right of the largest error.
x_max = np.max(reconstruction_error_test) + np.std(reconstruction_error_test)


axs[0].axvline(threshold, color='r', ls='--')
axs[0].set(xlim=(0, x_max))

# The label needs an explicit '\n' escape; a literal line break inside a
# single-quoted string is a syntax error.
axs[0].text(0.85, 0.2, 'threshold {:.3f}\n(percentile: {})'.format(threshold, p_threshold),
            horizontalalignment='left', verticalalignment='center', transform=axs[0].transAxes)


axs[1].axvline(threshold, color='r', ls='--')
axs[1].set(xlim=(0, x_max))


_ = sns.kdeplot(data=df_set, x='reconstruction_error', hue='type', ax=axs[0])
_ = sns.boxplot(data=df_set, x='reconstruction_error', y='type', orient='h', ax=axs[1])

png

# Positions of the test samples whose reconstruction error exceeds the
# threshold.
anomalies_index = np.flatnonzero(reconstruction_error_test > threshold)

anomalies_x = np.asarray(X_test)[anomalies_index]
anomalies_y = np.asarray(y_test)[anomalies_index]

if anomalies_index.size == 0:
    # plt.subplots cannot create a grid with zero columns, so report the
    # empty result instead of drawing.
    print('No samples with reconstruction error > {:.3f} (percentile: {})'.format(threshold, p_threshold))
else:
    # squeeze=False always yields a 2-D array of axes; without it a single
    # anomaly would return a bare Axes object that zip() cannot iterate.
    fig, axes = plt.subplots(nrows=1, ncols=anomalies_index.size, figsize=(10, 3), squeeze=False)
    fig.suptitle('Samples with reconstruction error > {:.3f} (percentile: {})'.format(threshold, p_threshold), fontsize=16)

    for ax, image, label in zip(axes[0], anomalies_x, anomalies_y):
        ax.set_axis_off()
        ax.imshow(image, cmap=plt.cm.gray_r)
        ax.set_title('%i' % label)

png

# Count how many flagged samples belong to each digit class.
anomaly_count_ax = sns.countplot(x=anomalies_y)
_ = anomaly_count_ax.set_title('Reconstruction error by target')

png

# Build out-of-distribution inputs: take the first ten digits, transpose each
# image (swap its row and column axes) and rescale the pixels to [0, 1].
first_ten = digits.images[0:10]
flipped_images = np.swapaxes(first_ten, 1, 2) / 16.
flipped_images

# Score the transposed images with the same per-sample reconstruction error
# and flag those above the threshold.
reconstruction_error_flipped_images = np.mean(
    tf.keras.losses.mae(autoencoder.predict(flipped_images), flipped_images), axis=-1)
is_anomaly = reconstruction_error_flipped_images > threshold

fig, axes = plt.subplots(nrows=1, ncols=len(flipped_images), figsize=(10, 2))
# The title has no placeholders, so the former .format(threshold, p_threshold)
# call was a no-op and has been removed.
fig.suptitle('Flipped images', fontsize=16)
for ax, image, anomaly in zip(axes, flipped_images, is_anomaly):
    ax.set_axis_off()
    ax.imshow(image, cmap=plt.cm.gray_r)
    # Only images classified as anomalous get a title.
    if anomaly:
        ax.set_title('anomaly')

png

# Show the reconstruction error of each transposed image as a one-column table.
pd.DataFrame({'reconstruction_error': reconstruction_error_flipped_images})
reconstruction_error
0 0.293820
1 0.219904
2 0.339496
3 0.316413
4 0.251248
5 0.287674
6 0.305904
7 0.311851
8 0.344047
9 0.310956