Now, we define a class called MetaSGD in which we implement the Meta-SGD algorithm. In the __init__ method, we initialize all the necessary variables. Then, we define our sigmoid activation function, and after that we define our train function:
class MetaSGD(object):
We define the __init__ method and initialize all necessary variables:
    def __init__(self):
        #initialize the number of tasks, i.e., the number of tasks we need in each batch of tasks
        self.num_tasks = 2
        #number of samples, i.e., the number of shots - the number of data points (k) we need in each task
        self.num_samples = 10
        #number of epochs, i.e., training iterations
        self.epochs = 10000
        #hyperparameter for the outer loop (outer gradient update), i.e., meta optimization
        self.beta = 0.0001
        #randomly initialize our model parameter theta
        self.theta = np.random.normal(size=50).reshape(50, 1)
        #randomly initialize alpha with the same shape as theta; in Meta-SGD, the inner-loop
        #learning rate alpha is a learnable per-parameter vector rather than a fixed scalar
        self.alpha = np.random.normal(size=50).reshape(50, 1)
We define our sigmoid activation function:
    def sigmoid(self, a):
        return 1.0 / (1 + np.exp(-a))
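If you ever see overflow warnings from np.exp when the activations become very negative, you can optionally clip the input first. The following clipped variant is only a hypothetical addition (the method name and the bound of 30 are arbitrary choices):
    def stable_sigmoid(self, a, clip=30.0):
        #clip the pre-activation so that np.exp(-a) can never overflow
        a = np.clip(a, -clip, clip)
        return 1.0 / (1 + np.exp(-a))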
Now, let's start training:
    def train(self):
For the number of epochs:
        for e in range(self.epochs):
            self.theta_ = []
For task i in the batch of tasks:
            for i in range(self.num_tasks):
We sample k data points and prepare our train set:
                XTrain, YTrain = sample_points(self.num_samples)
Then, we predict the value of y using a single-layer network:
                a = np.matmul(XTrain, self.theta)
                YHat = self.sigmoid(a)
We compute the loss and calculate gradients:
                #since we're performing classification, we use cross entropy loss as our loss function
                loss = ((np.matmul(-YTrain.T, np.log(YHat)) - np.matmul((1 - YTrain.T), np.log(1 - YHat))) / self.num_samples)[0][0]
                #calculate the gradient of the loss with respect to theta
                gradient = np.matmul(XTrain.T, (YHat - YTrain)) / self.num_samples
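The gradient line uses the standard closed-form result for a sigmoid output with cross-entropy loss, X^T(YHat - Y)/k. If you want to verify it numerically, here is a small, purely illustrative finite-difference check (the function name, eps value, and return value are arbitrary choices, not part of the Meta-SGD code):
def gradient_check(X, Y, theta, eps=1e-5):
    #compare the closed-form gradient against a central finite-difference estimate
    k = X.shape[0]
    def bce_loss(t):
        p = 1.0 / (1 + np.exp(-np.matmul(X, t)))
        return ((np.matmul(-Y.T, np.log(p)) - np.matmul((1 - Y.T), np.log(1 - p))) / k)[0][0]
    analytic = np.matmul(X.T, (1.0 / (1 + np.exp(-np.matmul(X, theta))) - Y)) / k
    numeric = np.zeros_like(theta)
    for j in range(theta.shape[0]):
        t_plus, t_minus = theta.copy(), theta.copy()
        t_plus[j] += eps
        t_minus[j] -= eps
        numeric[j] = (bce_loss(t_plus) - bce_loss(t_minus)) / (2 * eps)
    #the two estimates should agree up to numerical error
    return np.max(np.abs(analytic - numeric))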
After that, we perform the inner gradient update and find the optimal parameter theta' for each of the tasks:
                self.theta_.append(self.theta - (np.multiply(self.alpha, gradient)))
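Since alpha has the same shape as theta, np.multiply performs an elementwise product, which is exactly what distinguishes Meta-SGD from MAML's single scalar learning rate. A tiny illustration with made-up numbers (purely hypothetical values):
theta = np.array([[0.5], [1.0]])
alpha = np.array([[0.1], [0.01]])  #a separate learning rate per parameter
gradient = np.array([[2.0], [2.0]])
theta_prime = theta - np.multiply(alpha, gradient)
#theta_prime is [[0.3], [0.98]]: the first parameter takes a step ten times larger than the second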
We initialize the meta gradients:
            meta_gradient = np.zeros(self.theta.shape)
            for i in range(self.num_tasks):
We sample k data points and prepare our test set for meta training:
                XTest, YTest = sample_points(10)
We then predict the value of y:
                a = np.matmul(XTest, self.theta_[i])
                YPred = self.sigmoid(a)
We compute the meta gradients:
                meta_gradient += np.matmul(XTest.T, (YPred - YTest)) / self.num_samples
Now, we update both our model parameter theta and our learning rate vector alpha with the meta gradient:
            self.theta = self.theta - self.beta * meta_gradient / self.num_tasks
            self.alpha = self.alpha - self.beta * meta_gradient / self.num_tasks
We print the loss every 1,000 epochs:
            if e % 1000 == 0:
                print("Epoch {}: Loss {}\n".format(e, loss))
                print('Updated Model Parameter Theta\n')
                print('Sampling Next Batch of Tasks\n')
                print('---------------------------------\n')
The complete code for MetaSGD is given as follows:
class MetaSGD(object):
    def __init__(self):
        #initialize the number of tasks, i.e., the number of tasks we need in each batch of tasks
        self.num_tasks = 2
        #number of samples, i.e., the number of shots - the number of data points (k) we need in each task
        self.num_samples = 10
        #number of epochs, i.e., training iterations
        self.epochs = 10000
        #hyperparameter for the outer loop (outer gradient update), i.e., meta optimization
        self.beta = 0.0001
        #randomly initialize our model parameter theta
        self.theta = np.random.normal(size=50).reshape(50, 1)
        #randomly initialize alpha with the same shape as theta; in Meta-SGD, the inner-loop
        #learning rate alpha is a learnable per-parameter vector rather than a fixed scalar
        self.alpha = np.random.normal(size=50).reshape(50, 1)

    #define our sigmoid activation function
    def sigmoid(self, a):
        return 1.0 / (1 + np.exp(-a))

    #now let's get to the interesting part, i.e., training :P
    def train(self):
        #for the number of epochs,
        for e in range(self.epochs):
            self.theta_ = []
            #for task i in the batch of tasks
            for i in range(self.num_tasks):
                #sample k data points and prepare our train set
                XTrain, YTrain = sample_points(self.num_samples)
                #predict the value of y using a single-layer network
                a = np.matmul(XTrain, self.theta)
                YHat = self.sigmoid(a)
                #since we're performing classification, we use cross entropy loss as our loss function
                loss = ((np.matmul(-YTrain.T, np.log(YHat)) - np.matmul((1 - YTrain.T), np.log(1 - YHat))) / self.num_samples)[0][0]
                #calculate the gradient of the loss with respect to theta
                gradient = np.matmul(XTrain.T, (YHat - YTrain)) / self.num_samples
                #perform the inner update and store the optimal parameter theta' for each task
                self.theta_.append(self.theta - (np.multiply(self.alpha, gradient)))
            #initialize meta gradients
            meta_gradient = np.zeros(self.theta.shape)
            for i in range(self.num_tasks):
                #sample k data points and prepare our test set for meta training
                XTest, YTest = sample_points(10)
                #predict the value of y
                a = np.matmul(XTest, self.theta_[i])
                YPred = self.sigmoid(a)
                #compute meta gradients
                meta_gradient += np.matmul(XTest.T, (YPred - YTest)) / self.num_samples
            #update our randomly initialized model parameter theta with the meta gradients
            self.theta = self.theta - self.beta * meta_gradient / self.num_tasks
            #update our randomly initialized per-parameter learning rate alpha with the meta gradients
            self.alpha = self.alpha - self.beta * meta_gradient / self.num_tasks
            if e % 1000 == 0:
                print("Epoch {}: Loss {}\n".format(e, loss))
                print('Updated Model Parameter Theta\n')
                print('Sampling Next Batch of Tasks\n')
                print('---------------------------------\n')
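Note that this code relies on the sample_points helper defined earlier in the chapter, which returns k inputs and their binary labels. If you want to run the class on its own, a minimal stand-in (with a purely arbitrary data distribution, for illustration only) could look like this:
def sample_points(k):
    #generate k random 50-dimensional inputs and random binary labels, just so the snippet runs
    x = np.random.rand(k, 50)
    y = np.random.choice([0, 1], size=k, p=[0.5, 0.5]).reshape(k, 1)
    return x, y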
We create an instance of our MetaSGD class:
model = MetaSGD()
Let's start training the model:
model.train()
You can see how the loss decreases over the epochs:
Epoch 0: Loss 2.22523195333
Updated Model Parameter Theta
Sampling Next Batch of Tasks
---------------------------------

Epoch 1000: Loss 1.951785305709
Updated Model Parameter Theta
Sampling Next Batch of Tasks
---------------------------------

Epoch 2000: Loss 1.47382270343
Updated Model Parameter Theta
Sampling Next Batch of Tasks
---------------------------------

Epoch 3000: Loss 1.07296354822
Updated Model Parameter Theta
Sampling Next Batch of Tasks
---------------------------------
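Once training has converged, the learned theta and alpha can be used to adapt to a new task with a single inner-loop step. The following is only a sketch of how you might do that with this class; the support/query split and the accuracy metric are arbitrary choices, not part of the original code:
#sample a support set from a new task (using the same sample_points helper)
XSupport, YSupport = sample_points(10)

#one inner-loop step with the learned per-parameter learning rate alpha
YHat = model.sigmoid(np.matmul(XSupport, model.theta))
gradient = np.matmul(XSupport.T, (YHat - YSupport)) / 10
theta_adapted = model.theta - np.multiply(model.alpha, gradient)

#evaluate the adapted parameters on a query set from the same task
XQuery, YQuery = sample_points(10)
YPred = model.sigmoid(np.matmul(XQuery, theta_adapted)) > 0.5
print("Query accuracy: {}".format(np.mean(YPred == YQuery)))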