nntoolbox.py

# NNTOOLBOX.PY
#
# This file contains a collection of function for training
# neural networks.
#
# date: July 28, 2019
# author: Benjamin Riggan
#
import numpy as np
import time

def nnsetup(architecture):
	nn = {} # define empty dictionary
	nn['size'] = architecture
	nn['n'] = len(nn['size'])
	#print(nn['size'])
	#print(nn['n'])

	nn['activation_function'] = 'tanh_opt' # Activation functions of hidden layers: 'sigm' (sigmoid) or 'tanh_opt' (optimal tanh).
	nn['learning_rate'] = 2 # earning rate Note: typically needs to be lower when using 'sigm' activation function and non-normalized inputs.
	nn['momentum'] = 0.9 # Momentum
	nn['scaling_learningRate'] = 1 # scaling factor for the learning rate (each epoch)
	nn['weightPenaltyL2'] = 0.00001 # L2 regularization
	nn['nonSparsityPenalty'] = 0 # non sparsity penalty
	nn['sparsityTarget'] = 0.05 # sparsity target
	nn['inputZeroMaskedFraction'] = 0 # used for denoising autoencoders
	nn['dropoutFraction'] = 0 # dropout level
	nn['testing'] =0
	nn['output'] = 'sigm' # output unit 'sigm' (=logistic), 'softmax', and 'linear'
	nn['W'] = [None] * (nn['n']-1) # empty list of weight matrices
	nn['vW'] = [None] * (nn['n']-1) # empty list of momentum terms
	nn['verbose'] = True # change to True if printed output is desired

	for i in range(1,nn['n']):
		# weights and weight momentum
		nn['W'][i-1] = ((np.random.rand(int(nn['size'][i]), int(nn['size'][i-1])+1) - 0.5) * 2.0 * np.sqrt(6.0 / (nn['size'][i] + nn['size'][i-1])))
		nn['W'][i-1][:,0] = 0 # init bias to 0
		nn['vW'][i-1] = np.zeros(nn['W'][i-1].shape)

		# average activations (for use with sparsity)
		nn['p'] = np.zeros((1, int(nn['size'][i])))

	return nn

# stacked autoencoder setup
def saesetup(architecture):
	sae = {'n': len(architecture)-1} # number of autoencoders
	for i in range(1,len(architecture)):
		key = 'ae{}'.format(i-1)
		sae[key] = nnsetup([architecture[i-1], architecture[i], architecture[i-1]])
	return sae

# nnff performs a feedforward pass
def nnff(nn, x, y=None):
	n = nn['n']
	m = x.shape[0]

	x = np.hstack((np.ones((m,1)), x))
	nn['a'] = [None] * n
	nn['a'][0] = x

	if nn['dropoutFraction'] > 0:
		nn['dropoutMask'] = [None] * n

	# feed forward passnn['a'].append(np.dot(nn['a'][i-1] * nn['W'][i-1].T)
	for i in range(1,n-1):
		if nn['activation_function'] == 'plin':
			nn['a'][i] = np.dot(nn['a'][i-1],nn['W'][i-1].T)
		elif nn['activation_function'] == 'sigm':
			nn['a'][i] = 1. / (1. + np.exp(- np.dot(nn['a'][i-1],nn['W'][i-1].T)))
		elif nn['activation_function'] == 'tanh_opt':
			nn['a'][i] = np.tanh(np.dot(nn['a'][i-1],nn['W'][i-1].T))

		# dropout
		if nn['dropoutFraction'] > 0:
			if nn['testing']:
				nn['a'][i] = nn['a'][i] * (1 - nn['dropoutFraction'])
			else:
				nn['dropoutMask'][i] = np.random.rand(nn['a'][i].shape[0],nn['a'][i].shape[1])>nn['dropoutFraction']
				nn['a'][i] = nn['a'][i] * nn['dropoutMask'][i]

		# Add the bias term
		nn['a'][i] = np.hstack((np.ones((m,1)), nn['a'][i]))

	if nn['output'] == 'plin':
		nn['a'][n-1] = np.dot(nn['a'][n-2],nn['W'][n-2].T)
	elif nn['output'] == 'sigm':
		nn['a'][n-1] = 1. / (1. + np.exp(-np.dot(nn['a'][n-2],nn['W'][n-2].T)))
	elif nn['output'] == 'tanh_opt':
		nn['a'][n-1] = np.tanh(np.dot(nn['a'][n-2],nn['W'][n-2].T))

	# error and loss
	nn['e'] = y - nn['a'][n-1]
	nn['L'] = 0.5 * np.sum(nn['e']**2) / m

	return nn

def nnbp(nn):
	n = nn['n']
	d = [None] * n

	if nn['output'] == 'sigm':
		d[n-1] = -nn['e'] * (nn['a'][n-1] * (1- nn['a'][n-1]))
	elif nn['output'] == 'tanh_opt':
		d[n-1] = - nn['e'] * (1-nn['a'][n-1]**2)
	elif nn['output'] == 'plin':
		d[n-1] = - nn['e']

	for i in range(n-2,0,-1):
		# derivative of activation function
		if nn['activation_function'] == 'plin':
			d_act = 1
		elif nn['activation_function'] == 'sigm':
			d_act = nn['a'][i] * (1 - nn['a'][i])
		elif nn['activation_function'] == 'tanh_opt':
			d_act = (1 - nn['a'][i]**2)

		# Backpropagate first derivative
		if i+2 == n:
			d[i] = np.dot(d[i+1], nn['W'][i]) * d_act
		else:
			d[i] = np.dot(d[i+1][:,1:], nn['W'][i]) * d_act

		if nn['dropoutFraction'] > 0:
			d[i] = d[i] * np.hstack((np.ones((d[i].shape[0], 1)), nn['dropoutMask'][i]))
	nn['dW'] = [None] * n
	for i in range(n-1):
		if i+2 == n:
			nn['dW'][i] = np.dot(d[i+1].T, nn['a'][i]) / d[i+1].shape[0]
		else:
			nn['dW'][i] = np.dot(d[i+1][:,1:].T, nn['a'][i]) / d[i+1].shape[0]

	return nn

# nnapplygrads updates weights and biases with calculated gradients
def nnapplygrads(nn):
	for i in range(0,nn['n']-1):
		if nn['weightPenaltyL2']>0:
			dW = nn['dW'][i] + nn['weightPenaltyL2'] * np.hstack((np.zeros((nn['W'][i].shape[0],1)), nn['W'][i][:,1:]))
		else:
			dW = nn['dW'][i]

		dW = nn['learning_rate'] * dW

		if nn['momentum'] > 0:
			nn['vW'][i] = nn['momentum'] * nn['vW'][i] + dW
			dW = nn['vW'][i]
		nn['W'][i] = nn['W'][i] - dW

	return nn
def nneval(nn, loss, train_x, train_y, val_x=None, val_y=None):
	nn['testing'] = 1
	nn = nnff(nn, train_x, train_y)
	loss['train']['e'].append(nn['L'])

	if val_x is not None and val_y is not None:
		nn = nnff(nn, val_x, val_y)
		loss['val']['e'].append(nn['L'])
	nn['testing'] = 0

	return loss

def nntrain(nn, train_x, train_y, opts, val_x=None, val_y=None):
	# trains a neural net
	loss={'train': {'e': [], 'e_frac': []} , 'val': {'e': [], 'e_frac': []} }
	opts['validation'] = 0
	if val_x is not None and val_y is not None:
		opts['validation'] = 1

	m = train_x.shape[0]
	batchsize = opts['batchsize']
	numepochs = opts['numepochs']
	eta0 = nn['learning_rate']
	numbatches = np.int(np.floor(m / batchsize))
	if nn['verbose'] is True:
		print('numbatches = {}'.format(numbatches))

	L = np.zeros((numepochs*numbatches,1));
	n = 0

	for i in range(numepochs):
		start = time.time()
		kk = np.random.permutation(m)
		#kk = np.arange(m)
		for l in range(numbatches):
			batch_x = train_x[ kk[l * batchsize : (l+1) * batchsize], :]
			batch_y = train_y[ kk[l * batchsize : (l+1) * batchsize], :]

			#print('feed forward...')
			nn = nnff(nn, batch_x, batch_y)
			#print('backprop...')
			nn = nnbp(nn)
			#print('applygrads...')
			nn = nnapplygrads(nn);

			L[n] = nn['L']
			n = n + 1
		elapsed = time.time() - start
		if opts['validation'] == 1:
			loss = nneval(nn, loss, train_x, train_y, val_x, val_y)
			str_perf = '; Full-batch train mse = {}, val mse = {}'.format(loss['train']['e'][-1], loss['val']['e'][-1])
		else:
			#print('eval...')
			loss = nneval(nn, loss, train_x, train_y)
			str_perf = '; Full-batch train mse = {}'.format(loss['train']['e'][-1])
		if nn['verbose'] is True:
			print('epoch {} / {}. Took {} seconds. Mini-batch mean squared error on training set is {} {}'.format(i+1,opts['numepochs'],elapsed,np.mean(L[n-numbatches:(n-1)]), str_perf))
		if 'epsilon' in opts:
			if opts['epsilon'] > 0 and i>0:
				absdiff = np.abs(loss['train']['e'][-1] - loss['train']['e'][-2])
				if absdiff < opts['epsilon']:
					if nn['verbose'] is True:
						print('Network converged: {}'.format(absdiff))
	return nn

def saetrain(sae, x, opts, val_x=None):
	for i in range(sae['n']):
		print('Training AE {} / {}'.format(i, sae['n']-1))
		key = 'ae{}'.format(i)
		if val_x is None:
			sae[key] = nntrain(sae[key], x, x, opts)
			t = nnff(sae[key], x, x)
			x = t['a'][1]
			# remove bias
			x = x[:,1:]
		else:
			sae[key] = nntrain(sae[key], x, x, opts, val_x, val_x)
			t = nnff(sae[key], x, x)
			x = t['a'][1]
			# remove bias
			x = x[:,1:]
			t = nnff(sae[key], val_x, val_x)
			val_x = t['a'][1]
			# remove bias
			val_x = val_x[:,1:]
	return sae