Source code for or_suite.agents.oil_discovery.grid_search

import numpy as np
import random
import sys
from .. import Agent


class grid_searchAgent(Agent):
    """
    Agent that uses a bisection-method heuristic algorithm to find the location
    with the highest probability of discovering oil.

    For every time step the agent keeps a box ``[lower, upper]`` per dimension.
    It alternates between sampling the midpoint of that box and sampling a
    forward / backward perturbation along a single dimension. Once both
    perturbations of a dimension have been observed, the box is halved in that
    dimension towards the perturbation with the higher reward.

    Methods:
        reset() : restores the bounds and all search bookkeeping to their initial values
        update_config() : (UNIMPLEMENTED)
        update_obs(obs, action, reward, newObs, timestep, info) : record reward of current midpoint
            or perturbation, and move bounds in direction of higher reward
        update_policy(k) : no-op
        pick_action(state, step) : move agent to midpoint or perturb current dimension

    Attributes:
        epLen: (int) number of time steps to run the experiment for
        dim: (int) dimension of metric space for agent and environment
        upper: (np.ndarray, shape (epLen, dim)) upper bounds of agent at each step in each dimension
        lower: (np.ndarray, shape (epLen, dim)) lower bounds of agent at each step in each dimension
        perturb_estimates: (np.ndarray, shape (epLen, 2*dim)) rewards observed for the perturbations;
            column 2*d holds the forward and column 2*d+1 the backward perturbation of dimension d
        midpoint_value: (np.ndarray, shape (epLen,)) reward observed at the midpoint at each step
        dim_index: (int list) per-step index of the next perturbation column to fill
        select_midpoint: (bool list) per-step flag, True when the next sample is the midpoint
            and False when it is a perturbation
    """

    def __init__(self, epLen, dim=1):
        """
        Args:
            epLen: (int) number of time steps to run the experiment for
            dim: (int) dimension of metric space for agent and environment
        """

        # Saving parameters like the epLen, dimension of the space
        self.epLen = epLen
        self.dim = dim

        self._init_state()

    def _init_state(self):
        """Set the bounds and the search bookkeeping to their initial values."""
        # Current bounds for the upper and lower estimates on where the maximum value is
        self.upper = np.ones((self.epLen, self.dim))
        self.lower = np.zeros((self.epLen, self.dim))

        # Estimates obtained for the "perturbed" values
        self.perturb_estimates = np.zeros((self.epLen, 2 * self.dim))
        self.midpoint_value = np.zeros(self.epLen)
        self.dim_index = [0 for _ in range(self.epLen)]

        # Indicator of "where" we are in the process, i.e. selecting the midpoint,
        # doing small perturbations, etc
        self.select_midpoint = [True for _ in range(self.epLen)]

    def reset(self):
        """
        Restore the agent to its freshly constructed state.

        Besides resetting upper to ones and lower to zeros, the perturbation
        bookkeeping is cleared as well. Otherwise a reset in the middle of a
        forward/backward pair would compare a stale reward gathered with the
        old bounds against a new one gathered with the reset bounds.
        """
        self._init_state()

    def update_obs(self, obs, action, reward, newObs, timestep, info):
        """
        If the midpoint was just sampled, store its reward and switch to
        perturbing. Otherwise store the perturbation reward and, once both the
        forward and backward perturbation of a dimension are known, move the
        lower or upper bound of that dimension to the midpoint, in the
        direction of the higher reward. The agent loops across each dimension
        separately and samples the midpoint again after every dimension.

        Args:
            obs: previous state (unused)
            action: action that produced the reward (unused)
            reward: (float) reward observed for the action
            newObs: new state (unused)
            timestep: (int) step of the episode the reward belongs to
            info: extra environment information (unused)
        """
        # If we selected the midpoint in prev step
        if self.select_midpoint[timestep]:
            # Store value of midpoint estimate
            self.midpoint_value[timestep] = reward
            # Switch to sampling the perturbed values
            self.select_midpoint[timestep] = False
            return

        self.perturb_estimates[timestep, self.dim_index[timestep]] = reward
        self.dim_index[timestep] += 1
        filled = self.dim_index[timestep]

        # An odd count means only the forward perturbation of the current
        # dimension has been observed so far; wait for the backward one.
        if filled % 2 != 0:
            return

        # Index into the upper/lower bound matrices of the dimension just completed
        bound_index = filled // 2 - 1
        midpoint = (self.upper[timestep, bound_index] +
                    self.lower[timestep, bound_index]) / 2

        # Even column = forward (+) perturbation, odd column = backward (-)
        # perturbation; this matches the sign chosen in pick_action.
        forward_reward = self.perturb_estimates[timestep, filled - 2]
        backward_reward = self.perturb_estimates[timestep, filled - 1]
        if forward_reward > backward_reward:
            # forward perturbation has the higher reward: move lower bound up
            self.lower[timestep, bound_index] = midpoint
        else:
            # otherwise move the upper bound down
            self.upper[timestep, bound_index] = midpoint

        # Reset dim_index once perturbations are completed in every dimension
        if filled == 2 * self.dim:
            self.dim_index[timestep] = 0
        self.select_midpoint[timestep] = True

    def update_policy(self, k):
        '''Update internal policy based upon records.

        Not used, because a greedy algorithm does not have a policy.'''

        # Greedy algorithm does not update policy
        pass

    def pick_action(self, state, step):
        """
        If upper and lower bounds are updated based on perturbed values, move agent to midpoint.
        Else, perturb one dimension by a quarter of the box width, i.e. half the
        distance from each bound to the midpoint.

        Args:
            state: current state (unused)
            step: (int) current step of the episode

        Returns:
            np.ndarray of length dim: the location to move to
        """

        # action taken at step h is used to maximize the step h+1 oil function
        next_step = step + 1
        # if last step, move agent to random location
        if next_step >= self.epLen:
            return np.random.rand(self.dim)

        # NOTE(review): bounds are read at next_step here while update_obs
        # writes them at `timestep`; confirm this offset is intended.
        upper = self.upper[next_step]
        lower = self.lower[next_step]
        center = (upper + lower) / 2

        if self.select_midpoint[step]:
            return center

        # dim_index // 2 selects the dimension; an even index perturbs forward
        # (+), an odd index perturbs backward (-).
        index = self.dim_index[step]
        perturbation = np.zeros(self.dim)
        perturbation[index // 2] = 1 if index % 2 == 0 else -1

        # perturb distance of 1/4 * width of dimension
        return center + (perturbation * (upper - lower) / 4)