Gridworld distance: restrict to physically realistic transitions (#12)

AdamGleave · web-flow · commit 89dcc7512905 · 2020-02-01T14:45:56.000-08:00
* Divergence heatmap: label axes with reward type

* Gridworld divergence: restrict to physically realistic transitions for consistency with PointMass

* Hardcode axis labels so script works without TeX
diff --git a/src/evaluating_rewards/analysis/gridworld_heatmap.py b/src/evaluating_rewards/analysis/gridworld_heatmap.py
@@ -114,11 +114,9 @@ def _make_transitions(
     transitions[high_action, states[idx < n - 1], states[idx > 0]] = 1
 
 
-def build_mdp(state_action_reward: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
-    """Create transition matrix for deterministic gridworld and reshape reward."""
-    xlen, ylen, na = state_action_reward.shape
+def build_transitions(xlen: int, ylen: int, na: int) -> np.ndarray:
+    """Create transition matrix for deterministic gridworld."""
     ns = xlen * ylen
-
     transitions = np.zeros((na, ns, ns))
     transitions[Actions.STAY.value, :, :] = np.eye(ns, ns)
     states = np.arange(ns)
@@ -127,8 +125,16 @@ def build_mdp(state_action_reward: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     _make_transitions(transitions, Actions.LEFT.value, Actions.RIGHT.value, states, ys, ylen)
     _make_transitions(transitions, Actions.DOWN.value, Actions.UP.value, states, xs, xlen)
 
+    return transitions
+
+
+def build_reward(state_action_reward: np.ndarray) -> np.ndarray:
+    """Reshape reward and fill in NaNs."""
+    xlen, ylen, na = state_action_reward.shape
+    ns = xlen * ylen
     reward = state_action_reward.copy()
     reward = reward.reshape(ns, na)
+
     # We use NaN for transitions that would go outside the gridworld.
     # But in above transition dynamics these are equivalent to stay, so rewrite.
     mask = np.isnan(reward)
@@ -137,7 +143,7 @@ def build_mdp(state_action_reward: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     reward[mask] = stay_tiled[mask]
     assert np.isfinite(reward).all()
 
-    return transitions, reward
+    return reward
 
 
 def _no_op_iter(*args, **kwargs):
@@ -147,7 +153,8 @@ def _no_op_iter(*args, **kwargs):
 
 def compute_qvalues(state_action_reward: np.ndarray, discount: float) -> np.ndarray:
     """Computes the Q-values of `state_action_reward` under deterministic dynamics."""
-    transitions, reward = build_mdp(state_action_reward)
+    transitions = build_transitions(*state_action_reward.shape)
+    reward = build_reward(state_action_reward)
 
     # TODO(adam): remove this workaround once GH pymdptoolbox #32 merged.
     with mock.patch("mdptoolbox.mdp.ValueIteration._boundIter", new=_no_op_iter):
diff --git a/src/evaluating_rewards/analysis/latex/figsymbols.sty b/src/evaluating_rewards/analysis/latex/figsymbols.sty
@@ -22,4 +22,7 @@
 \newcommand{\sparsepenalty}{\texttt{Penalty}}
 \newcommand{\centergoal}{\texttt{Center}}
 \newcommand{\dirtpath}{\texttt{Path}}
-\newcommand{\cliffwalk}{\texttt{Cliff}}
+\newcommand{\cliffwalk}{\texttt{Cliff}}
+
+\newcommand{\srcreward}{R_S}
+\newcommand{\targetreward}{R_T}
diff --git a/src/evaluating_rewards/analysis/plot_gridworld_divergence.py b/src/evaluating_rewards/analysis/plot_gridworld_divergence.py
@@ -25,7 +25,7 @@
 import sacred
 
 from evaluating_rewards import serialize, tabular
-from evaluating_rewards.analysis import gridworld_rewards, stylesheets, visualize
+from evaluating_rewards.analysis import gridworld_heatmap, gridworld_rewards, stylesheets, visualize
 from evaluating_rewards.scripts import script_utils
 
 plot_gridworld_divergence_ex = sacred.Experiment("plot_gridworld_divergence")
@@ -119,6 +119,32 @@ def make_reward(cfg: Dict[str, np.ndarray], discount: float) -> np.ndarray:
     return tabular.shape(state_reward, potential, discount)
 
 
+def direct_divergence(source: np.ndarray, target: np.ndarray, xlen: int, ylen: int) -> float:
+    """Computes direct divergence between `source` and `target`.
+
+    Args:
+        source: the source reward, indexed as [state, action, next state].
+        target: the target reward, indexed the same way as `source`.
+        xlen: width of gridworld.
+        ylen: height of gridworld.
+
+    Returns:
+        Direct divergence of `source` to `target`, under squared-error metric and uniform
+        random transition dataset. Specifically, dataset generated by sampling state s and
+        action a uniformly at random, and then deterministically computing the next state s'.
+        (We could include physically unattainable transitions, but this would be inconsistent
+        with the PointMass experiments, and is not possible in most environments.)
+    """
+    ns, na, ns2 = source.shape
+    assert ns == xlen * ylen
+    assert ns == ns2
+    transitions = gridworld_heatmap.build_transitions(xlen, ylen, na).transpose((1, 0, 2))
+    # Zero out rewards on physically unrealizable transitions in both reward functions.
+    source = source * transitions
+    target = target * transitions
+    return tabular.direct_sq_divergence(source, target)
+
+
 def compute_divergence(reward_cfg: Dict[str, Any], discount: float) -> pd.Series:
     """Compute divergence for each pair of rewards in `reward_cfg`."""
     rewards = {name: make_reward(cfg, discount) for name, cfg in reward_cfg.items()}
@@ -130,7 +156,8 @@ def compute_divergence(reward_cfg: Dict[str, Any], discount: float) -> pd.Series
             closest_reward = tabular.closest_reward_em(
                 src_reward, target_reward, n_iter=1000, discount=discount
             )
-            div = tabular.direct_sq_divergence(closest_reward, target_reward)
+            xlen, ylen = reward_cfg[src_name]["state_reward"].shape
+            div = direct_divergence(closest_reward, target_reward, xlen, ylen)
             divergence[target_name][src_name] = div
     divergence = pd.DataFrame(divergence)
     divergence = divergence.stack()
diff --git a/src/evaluating_rewards/analysis/stylesheets.py b/src/evaluating_rewards/analysis/stylesheets.py
@@ -40,7 +40,7 @@
         "figure.figsize": (3.25, 2.4375),
         "figure.subplot.top": 0.99,
         "figure.subplot.bottom": 0.16,
-        "figure.subplot.left": 0.16,
+        "figure.subplot.left": 0.17,
         "figure.subplot.right": 0.91,
     },
     "gridworld-heatmap": {
diff --git a/src/evaluating_rewards/analysis/visualize.py b/src/evaluating_rewards/analysis/visualize.py
@@ -181,6 +181,7 @@ def _heatmap_reformat(series, preserve_order):
 
 def comparison_heatmap(
     vals: pd.Series,
+    ax: plt.Axes,
     log: bool = True,
     fmt: Callable[[float], str] = short_e,
     cbar_kws: Optional[Dict[str, Any]] = None,
@@ -224,7 +225,12 @@ def comparison_heatmap(
     if robust:
         flat = data.values.flatten()
         kwargs["vmin"], kwargs["vmax"] = np.quantile(flat, [0.25, 0.75])
-    sns.heatmap(data, annot=annot, fmt="s", cmap=cmap, cbar_kws=cbar_kws, mask=mask, **kwargs)
+    sns.heatmap(
+        data, annot=annot, fmt="s", cmap=cmap, cbar_kws=cbar_kws, mask=mask, ax=ax, **kwargs
+    )
+
+    ax.set_xlabel(r"Target $R_T$")
+    ax.set_ylabel(r"Source $R_S$")
 
 
 def median_seeds(series: pd.Series) -> pd.Series:
@@ -388,7 +394,7 @@ def compact_heatmaps(
     for name, matching in masks.items():
         fig, ax = plt.subplots(1, 1, squeeze=True)
         match_mask = compute_mask(loss, matching)
-        comparison_heatmap(loss, fmt=fmt, preserve_order=True, mask=match_mask, ax=ax, **kwargs)
+        comparison_heatmap(loss, ax=ax, fmt=fmt, preserve_order=True, mask=match_mask, **kwargs)
         # make room for multi-line xlabels
         after_plot()
         figs[name] = fig