import numpy as np
import random
import sys
from .. import Agent
[docs]class grid_searchAgent(Agent):
"""
Agent that uses a bisection-method heuristic algorithm to the find location with
the highest probability of discovering oil.
Methods:
reset() : resets bounds of agent to reflect upper and lower bounds of metric space
update_config() : (UNIMPLEMENTED)
update_obs(obs, action, reward, newObs, timestep, info) : record reward of current midpoint
or move bounds in direction of higher reward
pick_action(state, step) : move agent to midpoint or perturb current dimension
Attributes:
epLen: (int) number of time steps to run the experiment for
dim: (int) dimension of metric space for agent and environment
upper: (float list list) matrix containing upper bounds of agent at each step in dimension
lower: (float list list) matrix contianing lower bounds of agent at each step in dimension
perturb_estimates: (float list list) matrix containing estimated rewards from perturbation in each dimension
midpoint_value: (float list) list containing midpoint of agent at each step
dim_index: (int list) list looping through various dimensions during perturbation
select_midpoint: (bool list) list recording whether to take midpoint or perturb at given step
"""
[docs] def __init__(self, epLen, dim=1):
"""
Args:
epLen: (int) number of time steps to run the experiment for
dim: (int) dimension of metric space for agent and environment
"""
# Saving parameters like the epLen, dimension of the space
self.epLen = epLen
self.dim = dim
# Current bounds for the upper and lower estimates on where the maximum value is
self.upper = np.ones((epLen, dim))
self.lower = np.zeros((epLen, dim))
# Estimates obtained for the "perturbed" values
self.perturb_estimates = np.zeros((epLen, 2*dim))
self.midpoint_value = np.zeros(epLen)
self.dim_index = [0 for _ in range(self.epLen)]
# Indicator of "where" we are in the process, i.e. selecting the midpoint, doing small perturbations, etc
self.select_midpoint = [True for _ in range(self.epLen)]
[docs] def reset(self):
# Resets upper to array of ones, lower to array of zeros
self.upper = np.ones((self.epLen, self.dim))
self.lower = np.zeros((self.epLen, self.dim))
[docs] def update_obs(self, obs, action, reward, newObs, timestep, info):
"""
If no perturbations needed, update reward to be value at midpoint.
Else, adjust upper or lower bound in the direction of higher
reward as determined by the perturbation step. Agent loops across
each dimension separately, and updates estimated midpoint after each
loop.
"""
# If we selected the midpoint in prev step
if self.select_midpoint[timestep]:
# Store value of midpoint estimate
self.midpoint_value[timestep] = reward
# Switch to sampling the purturbed values
self.select_midpoint[timestep] = False
else:
self.perturb_estimates[timestep, self.dim_index[timestep]] = reward
self.dim_index[timestep] += 1
if self.dim_index[timestep] > 0 and self.dim_index[timestep] % 2 == 0:
# corresponding index of upper/lower bound matrix given self.dim_indx[timestep]
bound_index = int(self.dim_index[timestep]/2 - 1)
midpoint = (self.upper[timestep, bound_index] +
self.lower[timestep, bound_index]) / 2
# compare pert forward with pert backwards in dimension of timestep
pert_f = self.dim_index[timestep]-2
pert_b = self.dim_index[timestep]-1
if self.perturb_estimates[timestep, pert_f] > self.perturb_estimates[timestep, pert_b]:
# if lower perturbation has higher reward, move lower bound up
self.lower[timestep, bound_index] = midpoint
else:
self.upper[timestep, bound_index] = midpoint
# reset dim_index once perturbations completed in every dimension
if self.dim_index[timestep] == 2*self.dim:
self.dim_index[timestep] = 0
self.select_midpoint[timestep] = True
return
[docs] def update_policy(self, k):
'''Update internal policy based upon records.
Not used, because a greedy algorithm does not have a policy.'''
# Greedy algorithm does not update policy
pass
[docs] def pick_action(self, state, step):
"""
If upper and lower bounds are updated based on perturbed values, move agent to midpoint.
Else, perturb dimension by factor equal to half the distance from each bound to midpoint.
"""
# action taken at step h is used to maximize the step h+1 oil function
if step+1 < self.epLen:
next_step = step+1
# if last step, move agent to random location
else:
return np.random.rand(self.dim)
if self.select_midpoint[step]:
action = (self.upper[next_step] + self.lower[next_step]) / 2
else:
# Gets the dimension index, mods it by 2 to get a 0,1 value, takes (-1) to the power
# so the sign switches from positive and negative
p_location = np.zeros(self.dim)
p_location[int(np.floor(self.dim_index[step] / 2))] = 1
perturbation = np.zeros(
self.dim) + (-1)**(np.mod(self.dim_index[step], 2))*p_location
# perturb distance of 1/4 * width of dimension
action = (self.upper[next_step] + self.lower[next_step]) / 2 + \
(perturbation*(self.upper[next_step] -
self.lower[next_step])/(4))
return action