env: enrich observation with history, anomalies, and discovery bonus
Five environment-level changes to make personality discovery learnable:
1. step_history (last 7 steps) added to RhythmObservation → the agent now has
   the raw action/reward/delta trajectory needed to detect profile anomalies
   across steps, not just the current snapshot
2. Per-meter anomaly signals in reward_breakdown → each step computes
   actual_delta minus expected_delta (the neutral-profile baseline after
   time-of-day and vitality factors), giving the agent a direct fingerprint
   of the hidden modifier (e.g. +0.06 vitality_anomaly on DEEP_WORK → workaholic)
3. First-class delta fields on RhythmObservation (vitality_delta, etc.) and
   last_action → no longer buried in the reward_breakdown dict
4. Discovery bonus (15%) added to _grade_episode → rewards a profile-adapted
   strategy in steps 14–27 (second half of the week); the introvert avoids
   social actions, the extrovert embraces them, the workaholic front-loads work.
   Without this, the grader rewarded generic meter management and ignored
   personality inference.
5. Profile assignment decoupled from seed → uses a scrambled RNG
   (seed ^ 0xA3C5F729) so models cannot memorize seed % 3 → profile patterns
   during training; an explicit profile= kwarg still overrides for eval

Verified: SOCIALIZE vitality_anomaly is -0.096 for the introvert, +0.038 for
the extrovert, and 0.000 for the workaholic → a clear per-step personality
fingerprint. Discovery bonus gap: 0.797 vs 0.587 for an introvert adapting
vs not adapting.
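For illustration, a hypothetical agent-side probe (not part of this commit)
that turns the verified SOCIALIZE fingerprint into a profile guess; the
thresholds are assumptions chosen between the measured anomalies:

    def guess_profile_from_socialize(reward_breakdown: dict) -> str:
        """Classify the hidden profile from one SOCIALIZE step's vitality anomaly."""
        anomaly = reward_breakdown.get("vitality_anomaly", 0.0)
        if anomaly < -0.05:    # introvert fingerprint is -0.096
            return "introvert_morning"
        if anomaly > 0.02:     # extrovert fingerprint is +0.038
            return "extrovert_night_owl"
        return "workaholic_stoic"  # workaholic shows ~0.000 on social actions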
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- __init__.py +2 -1
- models.py +40 -4
- server/rhythm_environment.py +131 -13
__init__.py
@@ -13,12 +13,13 @@ a 7-day week with hidden personality profiles.
 """
 
 from .client import RhythmEnv
-from .models import ActionType, RhythmAction, RhythmObservation, RhythmState
+from .models import ActionType, RhythmAction, RhythmObservation, RhythmState, StepRecord
 
 __all__ = [
     "RhythmEnv",
     "RhythmAction",
     "RhythmObservation",
     "RhythmState",
+    "StepRecord",
     "ActionType",
 ]
models.py
@@ -15,10 +15,10 @@ a 7-day week with hidden personality profiles.
 from __future__ import annotations
 
 from enum import Enum
-from typing import Dict, Optional
+from typing import Dict, List, Optional
 
 from openenv.core.env_server import Action, Observation, State
-from pydantic import Field
+from pydantic import BaseModel, Field
 
 
 class ActionType(str, Enum):
@@ -46,12 +46,37 @@ class RhythmAction(Action):
     action_type: ActionType
 
 
+class StepRecord(BaseModel):
+    """
+    Record of one completed step included in step_history.
+
+    Contains the action taken, the reward received, and per-meter deltas.
+    The agent uses this history to detect personality anomalies over time.
+    """
+
+    step: int
+    action: str
+    reward: float
+    vitality_delta: float = 0.0
+    cognition_delta: float = 0.0
+    progress_delta: float = 0.0
+    serenity_delta: float = 0.0
+    connection_delta: float = 0.0
+
+
 class RhythmObservation(Observation):
     """
     Observation returned to the agent each step.
 
-    The agent sees all 5 meters
-
+    The agent sees all 5 meters, temporal context, last-step deltas,
+    anomaly signals (actual vs expected meter changes), and a rolling
+    history of the last 7 steps. The hidden personality profile and
+    reward weight decomposition are NOT included.
+
+    The step_history and *_anomaly fields in reward_breakdown together
+    give the agent everything it needs to infer the hidden profile:
+    - step_history: raw action/reward/delta trajectory for pattern matching
+    - *_anomaly: how much each meter deviated from neutral-profile expectation
     """
 
     timestep: int = 0
@@ -68,6 +93,17 @@ class RhythmObservation(Observation):
     done: bool = False
     reward_breakdown: Dict[str, float] = Field(default_factory=dict)
 
+    # Last step's per-meter deltas as first-class fields (not buried in reward_breakdown)
+    vitality_delta: float = 0.0
+    cognition_delta: float = 0.0
+    progress_delta: float = 0.0
+    serenity_delta: float = 0.0
+    connection_delta: float = 0.0
+    last_action: Optional[str] = None
+
+    # Rolling history of the last HISTORY_LENGTH steps
+    step_history: List[StepRecord] = Field(default_factory=list)
+
 
 class RhythmState(State):
     """
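A usage sketch for step_history (hypothetical client-side helper, not part of
this diff): fold the rolling history into per-action average deltas to surface
profile-skewed actions.

    from collections import defaultdict

    def mean_vitality_delta_by_action(step_history):
        """Average the observed vitality_delta per action over the rolling history."""
        totals = defaultdict(float)
        counts = defaultdict(int)
        for rec in step_history:  # rec is a StepRecord
            totals[rec.action] += rec.vitality_delta
            counts[rec.action] += 1
        return {action: totals[action] / counts[action] for action in totals}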
server/rhythm_environment.py
@@ -13,6 +13,16 @@ profiles secretly control how actions affect meters and how reward is
 computed. The agent must discover these hidden dynamics through experience.
 
 1 episode = 1 week, 1 step = 1 time slot (4 per day), 28 steps total.
+
+Key design principles for learnability:
+- step_history: last 7 steps of (action, reward, deltas) are included
+  in every observation so the agent can detect personality anomalies
+- *_anomaly fields: per-meter deviation from neutral-profile expectation,
+  giving a direct fingerprint of the hidden profile each step
+- discovery_bonus: 15% of final grade rewards profile-adapted strategy
+  in the second half of the week (steps 14–27)
+- Profile assignment uses a scrambled seed to prevent memorization
+  of seed → profile mappings during training
 """
 
 import random
@@ -24,9 +34,9 @@ from openenv.core.env_server import Environment
 from openenv.core.env_server.types import EnvironmentMetadata
 
 try:
-    from ..models import ActionType, RhythmAction, RhythmObservation, RhythmState
+    from ..models import ActionType, RhythmAction, RhythmObservation, RhythmState, StepRecord
 except (ImportError, ModuleNotFoundError):
-    from models import ActionType, RhythmAction, RhythmObservation, RhythmState
+    from models import ActionType, RhythmAction, RhythmObservation, RhythmState, StepRecord
 
 # ---------------------------------------------------------------------------
 # Constants
@@ -42,6 +52,7 @@ EVENT_PROBABILITY = 0.08
 CRITICAL_THRESHOLD = 0.1
 CRITICAL_PENALTY = -0.3
 REWARD_SCALE = 15.0
+HISTORY_LENGTH = 7  # number of past steps included in every observation
 
 # ---------------------------------------------------------------------------
 # Action-Effect Matrix (base deltas per action on each meter)
@@ -174,6 +185,7 @@ PROFILE_MAP: Dict[str, Dict[str, Any]] = {p["name"]: p for p in PROFILES}
 # Social actions for modifier checks
 SOCIAL_ACTIONS = {"family_time", "socialize"}
 IDLE_ACTIONS = {"me_time", "binge_watch", "sleep"}
+WORK_ACTIONS = {"deep_work", "learn", "admin_work"}
 
 
 class RhythmEnvironment(Environment):
@@ -184,6 +196,15 @@ class RhythmEnvironment(Environment):
     Connection) across a 7-day week. Hidden personality profiles secretly
     control how actions affect meters and how reward is computed. The agent
     must discover these hidden dynamics through experience.
+
+    Every observation includes:
+    - Current meter values and temporal context
+    - Last step's per-meter deltas as first-class fields
+    - Anomaly signals: actual delta minus neutral-profile expectation
+    - Rolling step_history (last 7 steps) with actions, rewards, deltas
+
+    The final grade rewards profile-appropriate strategy in the second half
+    of the week (discovery_bonus, 15% of score).
     """
 
     SUPPORTS_CONCURRENT_SESSIONS: bool = True
@@ -204,6 +225,7 @@ class RhythmEnvironment(Environment):
         self._crash_count: int = 0
         self._total_reward: float = 0.0
         self._recent_actions: list = []
+        self._step_history: list = []
 
     def get_metadata(self) -> EnvironmentMetadata:
         return EnvironmentMetadata(
@@ -213,7 +235,7 @@ class RhythmEnvironment(Environment):
                 "where an agent balances 5 life meters across a 7-day week "
                 "with hidden personality profiles."
             ),
-            version="0.
+            version="0.3.0",
         )
 
     # ------------------------------------------------------------------
@@ -234,12 +256,15 @@ class RhythmEnvironment(Environment):
 
         self._rng = random.Random(effective_seed)
 
-        # Profile selection: explicit kwarg
+        # Profile selection: explicit kwarg overrides; otherwise use scrambled seed.
+        # Scrambling decouples profile from episode dynamics (events, etc.) so the
+        # model cannot memorize seed → profile patterns during training.
         profile_name = kwargs.get("profile")
         if profile_name and profile_name in PROFILE_MAP:
             self._profile = deepcopy(PROFILE_MAP[profile_name])
         else:
-
+            profile_rng = random.Random(effective_seed ^ 0xA3C5F729)
+            profile_index = profile_rng.randint(0, len(PROFILES) - 1)
             self._profile = deepcopy(PROFILES[profile_index])
 
         # Initialize meters from profile defaults
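A quick decoupling sanity check (hypothetical script, not part of the diff; it
re-derives the profile index the same way reset() now does, assuming the three
profiles):

    import random
    from collections import Counter

    SCRAMBLE = 0xA3C5F729  # same constant used in reset()
    counts = Counter(
        random.Random(seed ^ SCRAMBLE).randint(0, 2) for seed in range(3000)
    )
    print(counts)  # expect a roughly even three-way split, and no seed % 3 pattern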
@@ -255,6 +280,7 @@ class RhythmEnvironment(Environment):
         self._crash_count = 0
         self._total_reward = 0.0
         self._recent_actions = []
+        self._step_history = []
 
         self._state = RhythmState(
             episode_id=episode_id or str(uuid4()),
@@ -282,6 +308,9 @@ class RhythmEnvironment(Environment):
         timeout_s: Optional[float] = None,
         **kwargs: Any,
     ) -> RhythmObservation:
+        # Save step number before incrementing (used for history record)
+        current_step = self._timestep
+
         slot = self._timestep % SLOTS_PER_DAY
         day = self._timestep // SLOTS_PER_DAY
         action_name = action.action_type.value
@@ -306,6 +335,11 @@ class RhythmEnvironment(Environment):
         if action_name != "sleep":
             effects = self._apply_time_multipliers(effects, slot)
 
+        # Snapshot expected effects here, after time/dampening but BEFORE profile
+        # modifiers. The anomaly = actual_delta - expected gives the agent a direct
+        # per-step fingerprint of the hidden profile modifier.
+        expected_no_profile = dict(effects)
+
         # --- 4. Apply profile modifiers ---
         effects = self._apply_profile_modifiers(effects, action_name, slot)
 
@@ -314,6 +348,10 @@ class RhythmEnvironment(Environment):
         for meter in METERS:
             if meter != "vitality" and effects[meter] > 0:
                 effects[meter] *= vitality_factor
+        # Apply the same vitality factor to expected for a fair anomaly comparison
+        for meter in METERS:
+            if meter != "vitality" and expected_no_profile[meter] > 0:
+                expected_no_profile[meter] *= vitality_factor
 
         # --- 6. Apply passive decays ---
         self._apply_passive_decays()
@@ -349,9 +387,14 @@ class RhythmEnvironment(Environment):
         done = self._timestep >= MAX_STEPS
 
         # --- 12. Build reward breakdown ---
+        # Includes: per-meter deltas, per-meter anomalies (actual - expected),
+        # event flag, and final_score on the last step.
         reward_breakdown: Dict[str, float] = {}
         for meter in METERS:
             reward_breakdown[f"{meter}_delta"] = round(deltas[meter], 4)
+            reward_breakdown[f"{meter}_anomaly"] = round(
+                deltas[meter] - expected_no_profile[meter], 4
+            )
         if active_event:
             reward_breakdown["event"] = 1.0
 
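To illustrate the anomaly arithmetic with hypothetical numbers (the baseline
and post-modifier delta below are assumed; only the -0.096 result matches the
verified introvert fingerprint):

    expected = -0.04   # assumed neutral-profile vitality baseline for SOCIALIZE
    actual = -0.136    # assumed delta after the introvert's hidden social penalty
    anomaly = round(actual - expected, 4)
    assert anomaly == -0.096  # reported as reward_breakdown["vitality_anomaly"]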
@@ -372,11 +415,27 @@ class RhythmEnvironment(Environment):
         self._state.connection = round(self._connection, 4)
         self._state.active_event = active_event
 
+        # --- 15. Append completed step to rolling history ---
+        self._step_history.append({
+            "step": current_step,
+            "action": action_name,
+            "reward": reward,
+            "vitality_delta": round(deltas["vitality"], 4),
+            "cognition_delta": round(deltas["cognition"], 4),
+            "progress_delta": round(deltas["progress"], 4),
+            "serenity_delta": round(deltas["serenity"], 4),
+            "connection_delta": round(deltas["connection"], 4),
+        })
+        if len(self._step_history) > HISTORY_LENGTH:
+            self._step_history.pop(0)
+
         return self._make_observation(
             reward=reward,
             done=done,
             active_event=active_event,
             reward_breakdown=reward_breakdown,
+            deltas=deltas,
+            last_action=action_name,
         )
 
     # ------------------------------------------------------------------
@@ -490,7 +549,7 @@ class RhythmEnvironment(Environment):
 
         # Work vitality recovery: workaholic gets vitality from productive work
         wvr = profile.get("work_vitality_recovery", 0.0)
-        if wvr > 0 and action_name in
+        if wvr > 0 and action_name in WORK_ACTIONS:
             effects["vitality"] += wvr
 
         # Low serenity amplification (stress spiral)
@@ -519,16 +578,26 @@ class RhythmEnvironment(Environment):
         return reward * REWARD_SCALE
 
     def _grade_episode(self) -> float:
-        """
+        """
+        Compute final episode score in [0, 1].
+
+        Scoring breakdown:
+          0.25 → meter balance (high mean, low variance)
+          0.20 → crash-free ratio (no critical meter drops)
+          0.20 → progress made
+          0.15 → connection maintained
+          0.05 → efficiency (average reward)
+          0.15 → discovery bonus (profile-adapted strategy in second half)
+        """
         meters = {m: getattr(self, f"_{m}") for m in METERS}
 
-        # 1. Meter balance (0.
+        # 1. Meter balance (0.25): high mean, low variance
         values = list(meters.values())
         mean_meter = sum(values) / len(values)
         variance = sum((v - mean_meter) ** 2 for v in values) / len(values)
         balance_score = max(0.0, mean_meter - variance)
 
-        # 2. No crashes (0.
+        # 2. No crashes (0.20): fraction of steps without critical meters
         steps = max(self._timestep, 1)
         crash_free_ratio = 1.0 - (self._crash_count / (steps * len(METERS)))
 
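The six weights in the new docstring above sum to exactly 1.0, so a perfect
episode still grades to 1.0:

    weights = [0.25, 0.20, 0.20, 0.15, 0.05, 0.15]
    assert abs(sum(weights) - 1.0) < 1e-9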
@@ -538,16 +607,40 @@ class RhythmEnvironment(Environment):
         # 4. Connection maintained (0.15)
         connection_score = self._connection
 
-        # 5. Efficiency (0.
+        # 5. Efficiency (0.05): normalized average reward
         avg_reward = self._total_reward / steps
         efficiency_score = max(0.0, min(1.0, (avg_reward + 1.0) / 2.0))
 
+        # 6. Discovery bonus (0.15): did the agent adapt its strategy to the
+        # hidden profile in the second half of the week (steps 14–27)?
+        # This is the only component that directly rewards personality discovery.
+        second_half = self._recent_actions[14:]
+        if len(second_half) > 0:
+            profile_name = self._profile["name"]
+            if profile_name == "introvert_morning":
+                # Introvert should minimise social actions
+                social_frac = sum(1 for a in second_half if a in SOCIAL_ACTIONS) / len(second_half)
+                discovery_score = max(0.0, 1.0 - social_frac * 2.5)
+            elif profile_name == "extrovert_night_owl":
+                # Extrovert should embrace social actions
+                social_frac = sum(1 for a in second_half if a in SOCIAL_ACTIONS) / len(second_half)
+                discovery_score = min(1.0, social_frac * 2.5)
+            elif profile_name == "workaholic_stoic":
+                # Workaholic should front-load work actions
+                work_frac = sum(1 for a in second_half if a in WORK_ACTIONS) / len(second_half)
+                discovery_score = min(1.0, work_frac * 1.5)
+            else:
+                discovery_score = 0.5
+        else:
+            discovery_score = 0.5
+
         score = (
-            0.
-            + 0.
+            0.25 * balance_score
+            + 0.20 * crash_free_ratio
             + 0.20 * progress_score
             + 0.15 * connection_score
-            + 0.
+            + 0.05 * efficiency_score
+            + 0.15 * discovery_score
         )
         return max(0.0, min(1.0, score))
 
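Worked example of the introvert discovery term above: 2 social actions in a
14-step second half give social_frac = 2/14 ≈ 0.143 and discovery_score ≈ 0.643,
contributing 0.15 * 0.643 ≈ 0.096 to the final grade:

    SOCIAL_ACTIONS = {"family_time", "socialize"}
    second_half = ["deep_work"] * 12 + ["socialize"] * 2
    social_frac = sum(1 for a in second_half if a in SOCIAL_ACTIONS) / len(second_half)
    discovery_score = max(0.0, 1.0 - social_frac * 2.5)
    assert round(discovery_score, 3) == 0.643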
@@ -557,8 +650,24 @@ class RhythmEnvironment(Environment):
         done: bool,
         active_event: Optional[str],
         reward_breakdown: Optional[Dict[str, float]] = None,
+        deltas: Optional[Dict[str, float]] = None,
+        last_action: Optional[str] = None,
     ) -> RhythmObservation:
         """Build the observation returned to the agent (hides profile)."""
+        step_records = [
+            StepRecord(
+                step=h["step"],
+                action=h["action"],
+                reward=h["reward"],
+                vitality_delta=h["vitality_delta"],
+                cognition_delta=h["cognition_delta"],
+                progress_delta=h["progress_delta"],
+                serenity_delta=h["serenity_delta"],
+                connection_delta=h["connection_delta"],
+            )
+            for h in self._step_history
+        ]
+
         return RhythmObservation(
             timestep=self._timestep,
             day=self._timestep // SLOTS_PER_DAY,
@@ -573,4 +682,13 @@ class RhythmEnvironment(Environment):
             reward_breakdown=reward_breakdown or {},
             reward=reward,
             done=done,
+            # First-class delta fields (from this step; zero on reset)
+            vitality_delta=round(deltas["vitality"], 4) if deltas else 0.0,
+            cognition_delta=round(deltas["cognition"], 4) if deltas else 0.0,
+            progress_delta=round(deltas["progress"], 4) if deltas else 0.0,
+            serenity_delta=round(deltas["serenity"], 4) if deltas else 0.0,
+            connection_delta=round(deltas["connection"], 4) if deltas else 0.0,
+            last_action=last_action,
+            # Rolling history of the last HISTORY_LENGTH completed steps
+            step_history=step_records,
         )