Now, we define a class called MetaSGD in which we implement the Meta-SGD algorithm. In the __init__ method, we initialize all the necessary variables. Then, we define our sigmoid activation function, and after that we define our train function:
class MetaSGD(object):
We define the __init__ method and initialize all necessary variables:
    def __init__(self):
        #initialize the number of tasks, i.e., the number of tasks we need in each batch of tasks
        self.num_tasks = 2
        #number of samples, i.e., the number of shots - the number of data points (k) we need in each task
        self.num_samples = 10
        #number of epochs, i.e., training iterations
        self.epochs = 10000
        #hyperparameter for the outer loop (outer gradient update), i.e., meta optimization
        self.beta = 0.0001
        #randomly initialize our model parameter theta
        self.theta = np.random.normal(size=50).reshape(50, 1)
        #randomly initialize alpha with the same shape as theta; in Meta-SGD, the inner-loop
        #learning rate alpha is a learnable per-parameter vector rather than a fixed scalar
        self.alpha = np.random.normal(size=50).reshape(50, 1)
We define our sigmoid activation function:
    def sigmoid(self, a):
        return 1.0 / (1 + np.exp(-a))
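If you ever see overflow warnings from np.exp when the activations become very negative, you can optionally clip the input first. The following clipped variant is only a hypothetical addition (the method name and the bound of 30 are arbitrary choices):
    def stable_sigmoid(self, a, clip=30.0):
        #clip the pre-activation so that np.exp(-a) can never overflow
        a = np.clip(a, -clip, clip)
        return 1.0 / (1 + np.exp(-a))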
Now, let's start training:
    def train(self):
For the number of epochs:
        for e in range(self.epochs):
            self.theta_ = []
For task i in the batch of tasks:
            for i in range(self.num_tasks):
We sample k data points and prepare our train set:
                XTrain, YTrain = sample_points(self.num_samples)
Then, we predict the value of y using a single-layer network:
                a = np.matmul(XTrain, self.theta)
                YHat = self.sigmoid(a)
We compute the loss and calculate gradients:
                #since we're performing classification, we use cross entropy loss as our loss function
                loss = ((np.matmul(-YTrain.T, np.log(YHat)) - np.matmul((1 - YTrain.T), np.log(1 - YHat))) / self.num_samples)[0][0]
                #calculate the gradient of the loss with respect to theta
                gradient = np.matmul(XTrain.T, (YHat - YTrain)) / self.num_samples
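The gradient line uses the standard closed-form result for a sigmoid output with cross-entropy loss, X^T(YHat - Y)/k. If you want to verify it numerically, here is a small, purely illustrative finite-difference check (the function name, eps value, and return value are arbitrary choices, not part of the Meta-SGD code):
def gradient_check(X, Y, theta, eps=1e-5):
    #compare the closed-form gradient against a central finite-difference estimate
    k = X.shape[0]
    def bce_loss(t):
        p = 1.0 / (1 + np.exp(-np.matmul(X, t)))
        return ((np.matmul(-Y.T, np.log(p)) - np.matmul((1 - Y.T), np.log(1 - p))) / k)[0][0]
    analytic = np.matmul(X.T, (1.0 / (1 + np.exp(-np.matmul(X, theta))) - Y)) / k
    numeric = np.zeros_like(theta)
    for j in range(theta.shape[0]):
        t_plus, t_minus = theta.copy(), theta.copy()
        t_plus[j] += eps
        t_minus[j] -= eps
        numeric[j] = (bce_loss(t_plus) - bce_loss(t_minus)) / (2 * eps)
    #the two estimates should agree up to numerical error
    return np.max(np.abs(analytic - numeric))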
After that, we perform the inner gradient update and find the optimal parameter theta' for each of the tasks:
                self.theta_.append(self.theta - (np.multiply(self.alpha, gradient)))
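Since alpha has the same shape as theta, np.multiply performs an elementwise product, which is exactly what distinguishes Meta-SGD from MAML's single scalar learning rate. A tiny illustration with made-up numbers (purely hypothetical values):
theta = np.array([[0.5], [1.0]])
alpha = np.array([[0.1], [0.01]])  #a separate learning rate per parameter
gradient = np.array([[2.0], [2.0]])
theta_prime = theta - np.multiply(alpha, gradient)
#theta_prime is [[0.3], [0.98]]: the first parameter takes a step ten times larger than the second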
We initialize the meta gradients:
            meta_gradient = np.zeros(self.theta.shape)
            for i in range(self.num_tasks):
We sample k data points and prepare our test set for meta training:
                XTest, YTest = sample_points(10)
We then predict the value of y:
                a = np.matmul(XTest, self.theta_[i])
                YPred = self.sigmoid(a)
We compute the meta gradients:
                meta_gradient += np.matmul(XTest.T, (YPred - YTest)) / self.num_samples
Now, we update both our model parameter theta and our learning rate vector alpha with the meta gradient:
            self.theta = self.theta - self.beta * meta_gradient / self.num_tasks
            self.alpha = self.alpha - self.beta * meta_gradient / self.num_tasks
We print the loss every 1,000 epochs:
            if e % 1000 == 0:
                print("Epoch {}: Loss {}\n".format(e, loss))
                print('Updated Model Parameter Theta\n')
                print('Sampling Next Batch of Tasks\n')
                print('---------------------------------\n')
The complete code for MetaSGD is given as follows:
class MetaSGD(object):
    def __init__(self):
        #initialize the number of tasks, i.e., the number of tasks we need in each batch of tasks
        self.num_tasks = 2
        #number of samples, i.e., the number of shots - the number of data points (k) we need in each task
        self.num_samples = 10
        #number of epochs, i.e., training iterations
        self.epochs = 10000
        #hyperparameter for the outer loop (outer gradient update), i.e., meta optimization
        self.beta = 0.0001
        #randomly initialize our model parameter theta
        self.theta = np.random.normal(size=50).reshape(50, 1)
        #randomly initialize alpha with the same shape as theta; in Meta-SGD, the inner-loop
        #learning rate alpha is a learnable per-parameter vector rather than a fixed scalar
        self.alpha = np.random.normal(size=50).reshape(50, 1)

    #define our sigmoid activation function
    def sigmoid(self, a):
        return 1.0 / (1 + np.exp(-a))

    #now let's get to the interesting part, i.e., training :P
    def train(self):
        #for the number of epochs,
        for e in range(self.epochs):
            self.theta_ = []
            #for task i in the batch of tasks
            for i in range(self.num_tasks):
                #sample k data points and prepare our train set
                XTrain, YTrain = sample_points(self.num_samples)
                #predict the value of y using a single-layer network
                a = np.matmul(XTrain, self.theta)
                YHat = self.sigmoid(a)
                #since we're performing classification, we use cross entropy loss as our loss function
                loss = ((np.matmul(-YTrain.T, np.log(YHat)) - np.matmul((1 - YTrain.T), np.log(1 - YHat))) / self.num_samples)[0][0]
                #calculate the gradient of the loss with respect to theta
                gradient = np.matmul(XTrain.T, (YHat - YTrain)) / self.num_samples
                #perform the inner update and store the optimal parameter theta' for each task
                self.theta_.append(self.theta - (np.multiply(self.alpha, gradient)))
            #initialize meta gradients
            meta_gradient = np.zeros(self.theta.shape)
            for i in range(self.num_tasks):
                #sample k data points and prepare our test set for meta training
                XTest, YTest = sample_points(10)
                #predict the value of y
                a = np.matmul(XTest, self.theta_[i])
                YPred = self.sigmoid(a)
                #compute meta gradients
                meta_gradient += np.matmul(XTest.T, (YPred - YTest)) / self.num_samples
            #update our randomly initialized model parameter theta with the meta gradients
            self.theta = self.theta - self.beta * meta_gradient / self.num_tasks
            #update our randomly initialized per-parameter learning rate alpha with the meta gradients
            self.alpha = self.alpha - self.beta * meta_gradient / self.num_tasks
            if e % 1000 == 0:
                print("Epoch {}: Loss {}\n".format(e, loss))
                print('Updated Model Parameter Theta\n')
                print('Sampling Next Batch of Tasks\n')
                print('---------------------------------\n')
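Note that this code relies on the sample_points helper defined earlier in the chapter, which returns k inputs and their binary labels. If you want to run the class on its own, a minimal stand-in (with a purely arbitrary data distribution, for illustration only) could look like this:
def sample_points(k):
    #generate k random 50-dimensional inputs and random binary labels, just so the snippet runs
    x = np.random.rand(k, 50)
    y = np.random.choice([0, 1], size=k, p=[0.5, 0.5]).reshape(k, 1)
    return x, y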
We create an instance of our MetaSGD class:
model = MetaSGD()
Let's start training the model:
model.train()
You can see how the loss decreases over the epochs:
Epoch 0: Loss 2.22523195333
Updated Model Parameter Theta
Sampling Next Batch of Tasks
---------------------------------

Epoch 1000: Loss 1.951785305709
Updated Model Parameter Theta
Sampling Next Batch of Tasks
---------------------------------

Epoch 2000: Loss 1.47382270343
Updated Model Parameter Theta
Sampling Next Batch of Tasks
---------------------------------

Epoch 3000: Loss 1.07296354822
Updated Model Parameter Theta
Sampling Next Batch of Tasks
---------------------------------
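Once training has converged, the learned theta and alpha can be used to adapt to a new task with a single inner-loop step. The following is only a sketch of how you might do that with this class; the support/query split and the accuracy metric are arbitrary choices, not part of the original code:
#sample a support set from a new task (using the same sample_points helper)
XSupport, YSupport = sample_points(10)

#one inner-loop step with the learned per-parameter learning rate alpha
YHat = model.sigmoid(np.matmul(XSupport, model.theta))
gradient = np.matmul(XSupport.T, (YHat - YSupport)) / 10
theta_adapted = model.theta - np.multiply(model.alpha, gradient)

#evaluate the adapted parameters on a query set from the same task
XQuery, YQuery = sample_points(10)
YPred = model.sigmoid(np.matmul(XQuery, theta_adapted)) > 0.5
print("Query accuracy: {}".format(np.mean(YPred == YQuery)))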