InosLihka Claude Sonnet 4.6 committed on
Commit
0a15ab5
·
1 Parent(s): c67f463

env: enrich observation with history, anomalies, and discovery bonus

Five environment-level changes to make personality discovery learnable (a consumption sketch follows the list):

1. step_history (last 7 steps) added to RhythmObservation -- the agent now has
the raw action/reward/delta trajectory needed to detect profile anomalies
across steps, not just the current snapshot

2. Per-meter anomaly signals in reward_breakdown -- each step computes
actual_delta minus expected_delta (neutral-profile baseline after
time-of-day + vitality factor), giving the agent a direct fingerprint
of the hidden modifier (e.g. +0.06 vitality_anomaly on DEEP_WORK = workaholic)

3. First-class delta fields on RhythmObservation (vitality_delta, etc.) and
last_action -- no longer buried in the reward_breakdown dict

4. Discovery bonus (15%) added to _grade_episode -- rewards profile-adapted
strategy in steps 14-27 (second half of the week); introvert avoids social,
extrovert embraces it, workaholic front-loads work. Without this, the
grader rewarded generic meter management and ignored personality inference.

5. Profile assignment decoupled from seed -- uses a scrambled RNG
(seed ^ 0xA3C5F729) so models cannot memorize seed % 3 -> profile patterns
during training; an explicit profile= kwarg still overrides for eval
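
As a consumption sketch (not part of this commit): the helper below shows how
an agent-side policy could turn the new *_anomaly and last_action fields into
a profile guess. The function name, thresholds, and return labels are all
hypothetical; only the field names come from the diffs below.

from typing import Iterable, Optional

SOCIAL = {"family_time", "socialize"}
WORK = {"deep_work", "learn", "admin_work"}

def guess_profile(observations: Iterable) -> Optional[str]:
    """Hypothetical classifier over a sequence of RhythmObservation objects."""
    social_anom = 0.0  # summed vitality_anomaly on social actions
    work_anom = 0.0    # summed vitality_anomaly on work actions
    for obs in observations:
        anomaly = obs.reward_breakdown.get("vitality_anomaly", 0.0)
        if obs.last_action in SOCIAL:
            social_anom += anomaly
        elif obs.last_action in WORK:
            work_anom += anomaly
    # Illustrative thresholds, sized to the per-step fingerprints verified
    # below (-0.096 / +0.038 on SOCIALIZE, +0.06 on DEEP_WORK).
    if social_anom <= -0.05:
        return "introvert"
    if social_anom >= 0.03:
        return "extrovert"
    if work_anom >= 0.05:
        return "workaholic"
    return None  # not enough evidence yet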

Verified: SOCIALIZE vitality_anomaly is -0.096 for introvert, +0.038 for
extrovert, and 0.000 for workaholic -- a clear per-step personality fingerprint.
Discovery bonus gap: an adapting introvert grades 0.797 vs 0.587 when not adapting.
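
A rough sketch of reproducing the fingerprint check, assuming reset() forwards
seed and profile kwargs as shown in the server diff below (the exact
constructor and reset signatures may differ):

# Force each profile, take one SOCIALIZE step, and read the anomaly.
for profile in ("introvert_morning", "extrovert_night_owl", "workaholic_stoic"):
    env = RhythmEnvironment()
    env.reset(seed=0, profile=profile)  # explicit kwarg bypasses the scrambled RNG
    obs = env.step(RhythmAction(action_type=ActionType.SOCIALIZE))
    print(profile, obs.reward_breakdown["vitality_anomaly"])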

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (3)
  1. __init__.py +2 -1
  2. models.py +40 -4
  3. server/rhythm_environment.py +131 -13
__init__.py CHANGED
@@ -13,12 +13,13 @@ a 7-day week with hidden personality profiles.
 """
 
 from .client import RhythmEnv
-from .models import ActionType, RhythmAction, RhythmObservation, RhythmState
+from .models import ActionType, RhythmAction, RhythmObservation, RhythmState, StepRecord
 
 __all__ = [
     "RhythmEnv",
     "RhythmAction",
     "RhythmObservation",
     "RhythmState",
+    "StepRecord",
     "ActionType",
 ]
models.py CHANGED
@@ -15,10 +15,10 @@ a 7-day week with hidden personality profiles.
 from __future__ import annotations
 
 from enum import Enum
-from typing import Dict, Optional
+from typing import Dict, List, Optional
 
 from openenv.core.env_server import Action, Observation, State
-from pydantic import Field
+from pydantic import BaseModel, Field
 
 
 class ActionType(str, Enum):
@@ -46,12 +46,37 @@ class RhythmAction(Action):
     action_type: ActionType
 
 
+class StepRecord(BaseModel):
+    """
+    Record of one completed step included in step_history.
+
+    Contains the action taken, the reward received, and per-meter deltas.
+    The agent uses this history to detect personality anomalies over time.
+    """
+
+    step: int
+    action: str
+    reward: float
+    vitality_delta: float = 0.0
+    cognition_delta: float = 0.0
+    progress_delta: float = 0.0
+    serenity_delta: float = 0.0
+    connection_delta: float = 0.0
+
+
 class RhythmObservation(Observation):
     """
     Observation returned to the agent each step.
 
-    The agent sees all 5 meters and temporal context, but NOT the hidden
-    personality profile or reward weight decomposition.
+    The agent sees all 5 meters, temporal context, last-step deltas,
+    anomaly signals (actual vs expected meter changes), and a rolling
+    history of the last 7 steps. The hidden personality profile and
+    reward weight decomposition are NOT included.
+
+    The step_history and *_anomaly fields in reward_breakdown together
+    give the agent everything it needs to infer the hidden profile:
+    - step_history: raw action/reward/delta trajectory for pattern matching
+    - *_anomaly: how much each meter deviated from neutral-profile expectation
     """
 
     timestep: int = 0
@@ -68,6 +93,17 @@ class RhythmObservation(Observation):
     done: bool = False
     reward_breakdown: Dict[str, float] = Field(default_factory=dict)
 
+    # Last step's per-meter deltas as first-class fields (not just buried in reward_breakdown)
+    vitality_delta: float = 0.0
+    cognition_delta: float = 0.0
+    progress_delta: float = 0.0
+    serenity_delta: float = 0.0
+    connection_delta: float = 0.0
+    last_action: Optional[str] = None
+
+    # Rolling history of the last HISTORY_LENGTH steps
+    step_history: List[StepRecord] = Field(default_factory=list)
+
 
 class RhythmState(State):
     """
server/rhythm_environment.py CHANGED
@@ -13,6 +13,16 @@ profiles secretly control how actions affect meters and how reward is
 computed. The agent must discover these hidden dynamics through experience.
 
 1 episode = 1 week, 1 step = 1 time slot (4 per day), 28 steps total.
+
+Key design principles for learnability:
+- step_history: last 7 steps of (action, reward, deltas) are included
+  in every observation so the agent can detect personality anomalies
+- *_anomaly fields: per-meter deviation from neutral-profile expectation,
+  giving a direct fingerprint of the hidden profile each step
+- discovery_bonus: 15% of final grade rewards profile-adapted strategy
+  in the second half of the week (steps 14-27)
+- Profile assignment uses a scrambled seed to prevent memorization
+  of seed -> profile mappings during training
 """
 
 import random
@@ -24,9 +34,9 @@ from openenv.core.env_server import Environment
 from openenv.core.env_server.types import EnvironmentMetadata
 
 try:
-    from ..models import ActionType, RhythmAction, RhythmObservation, RhythmState
+    from ..models import ActionType, RhythmAction, RhythmObservation, RhythmState, StepRecord
 except (ImportError, ModuleNotFoundError):
-    from models import ActionType, RhythmAction, RhythmObservation, RhythmState
+    from models import ActionType, RhythmAction, RhythmObservation, RhythmState, StepRecord
 
 # ---------------------------------------------------------------------------
 # Constants
@@ -42,6 +52,7 @@ EVENT_PROBABILITY = 0.08
 CRITICAL_THRESHOLD = 0.1
 CRITICAL_PENALTY = -0.3
 REWARD_SCALE = 15.0
+HISTORY_LENGTH = 7  # number of past steps included in every observation
 
 # ---------------------------------------------------------------------------
 # Action-Effect Matrix (base deltas per action on each meter)
@@ -174,6 +185,7 @@ PROFILE_MAP: Dict[str, Dict[str, Any]] = {p["name"]: p for p in PROFILES}
 # Social actions for modifier checks
 SOCIAL_ACTIONS = {"family_time", "socialize"}
 IDLE_ACTIONS = {"me_time", "binge_watch", "sleep"}
+WORK_ACTIONS = {"deep_work", "learn", "admin_work"}
 
 
 class RhythmEnvironment(Environment):
@@ -184,6 +196,15 @@ class RhythmEnvironment(Environment):
     Connection) across a 7-day week. Hidden personality profiles secretly
     control how actions affect meters and how reward is computed. The agent
     must discover these hidden dynamics through experience.
+
+    Every observation includes:
+    - Current meter values and temporal context
+    - Last step's per-meter deltas as first-class fields
+    - Anomaly signals: actual delta minus neutral-profile expectation
+    - Rolling step_history (last 7 steps) with actions, rewards, deltas
+
+    The final grade rewards profile-appropriate strategy in the second half
+    of the week (discovery_bonus, 15% of score).
     """
 
     SUPPORTS_CONCURRENT_SESSIONS: bool = True
@@ -204,6 +225,7 @@ class RhythmEnvironment(Environment):
        self._crash_count: int = 0
        self._total_reward: float = 0.0
        self._recent_actions: list = []
+       self._step_history: list = []
 
    def get_metadata(self) -> EnvironmentMetadata:
        return EnvironmentMetadata(
@@ -213,7 +235,7 @@
                "where an agent balances 5 life meters across a 7-day week "
                "with hidden personality profiles."
            ),
-           version="0.2.0",
+           version="0.3.0",
        )
 
    # ------------------------------------------------------------------
@@ -234,12 +256,15 @@ class RhythmEnvironment(Environment):
 
        self._rng = random.Random(effective_seed)
 
-       # Profile selection: explicit kwarg or seed-based
+       # Profile selection: explicit kwarg overrides; otherwise use scrambled seed.
+       # Scrambling decouples profile from episode dynamics (events, etc.) so the
+       # model cannot memorize seed -> profile patterns during training.
        profile_name = kwargs.get("profile")
        if profile_name and profile_name in PROFILE_MAP:
            self._profile = deepcopy(PROFILE_MAP[profile_name])
        else:
-           profile_index = effective_seed % len(PROFILES)
+           profile_rng = random.Random(effective_seed ^ 0xA3C5F729)
+           profile_index = profile_rng.randint(0, len(PROFILES) - 1)
            self._profile = deepcopy(PROFILES[profile_index])
 
        # Initialize meters from profile defaults
@@ -255,6 +280,7 @@ class RhythmEnvironment(Environment):
        self._crash_count = 0
        self._total_reward = 0.0
        self._recent_actions = []
+       self._step_history = []
 
        self._state = RhythmState(
            episode_id=episode_id or str(uuid4()),
@@ -282,6 +308,9 @@
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> RhythmObservation:
+       # Save step number before incrementing (used for history record)
+       current_step = self._timestep
+
        slot = self._timestep % SLOTS_PER_DAY
        day = self._timestep // SLOTS_PER_DAY
        action_name = action.action_type.value
@@ -306,6 +335,11 @@
        if action_name != "sleep":
            effects = self._apply_time_multipliers(effects, slot)
 
+       # Snapshot expected effects here -- after time/dampening but BEFORE profile
+       # modifiers. The anomaly = actual_delta - expected gives the agent a direct
+       # per-step fingerprint of the hidden profile modifier.
+       expected_no_profile = dict(effects)
+
        # --- 4. Apply profile modifiers ---
        effects = self._apply_profile_modifiers(effects, action_name, slot)
 
@@ -314,6 +348,10 @@
        for meter in METERS:
            if meter != "vitality" and effects[meter] > 0:
                effects[meter] *= vitality_factor
+       # Apply same vitality factor to expected for fair anomaly comparison
+       for meter in METERS:
+           if meter != "vitality" and expected_no_profile[meter] > 0:
+               expected_no_profile[meter] *= vitality_factor
 
        # --- 6. Apply passive decays ---
        self._apply_passive_decays()
@@ -349,9 +387,14 @@
        done = self._timestep >= MAX_STEPS
 
        # --- 12. Build reward breakdown ---
+       # Includes: per-meter deltas, per-meter anomalies (actual - expected),
+       # event flag, and final_score on the last step.
        reward_breakdown: Dict[str, float] = {}
        for meter in METERS:
            reward_breakdown[f"{meter}_delta"] = round(deltas[meter], 4)
+           reward_breakdown[f"{meter}_anomaly"] = round(
+               deltas[meter] - expected_no_profile[meter], 4
+           )
        if active_event:
            reward_breakdown["event"] = 1.0
 
@@ -372,11 +415,27 @@
        self._state.connection = round(self._connection, 4)
        self._state.active_event = active_event
 
+       # --- 15. Append completed step to rolling history ---
+       self._step_history.append({
+           "step": current_step,
+           "action": action_name,
+           "reward": reward,
+           "vitality_delta": round(deltas["vitality"], 4),
+           "cognition_delta": round(deltas["cognition"], 4),
+           "progress_delta": round(deltas["progress"], 4),
+           "serenity_delta": round(deltas["serenity"], 4),
+           "connection_delta": round(deltas["connection"], 4),
+       })
+       if len(self._step_history) > HISTORY_LENGTH:
+           self._step_history.pop(0)
+
        return self._make_observation(
            reward=reward,
            done=done,
            active_event=active_event,
            reward_breakdown=reward_breakdown,
+           deltas=deltas,
+           last_action=action_name,
        )
 
    # ------------------------------------------------------------------
@@ -490,7 +549,7 @@
 
        # Work vitality recovery: workaholic gets vitality from productive work
        wvr = profile.get("work_vitality_recovery", 0.0)
-       if wvr > 0 and action_name in ("deep_work", "learn", "admin_work"):
+       if wvr > 0 and action_name in WORK_ACTIONS:
            effects["vitality"] += wvr
 
        # Low serenity amplification (stress spiral)
@@ -519,16 +578,26 @@
        return reward * REWARD_SCALE
 
    def _grade_episode(self) -> float:
-       """Compute final episode score in [0, 1]."""
+       """
+       Compute final episode score in [0, 1].
+
+       Scoring breakdown:
+           0.25 -- meter balance (high mean, low variance)
+           0.20 -- crash-free ratio (no critical meter drops)
+           0.20 -- progress made
+           0.15 -- connection maintained
+           0.05 -- efficiency (average reward)
+           0.15 -- discovery bonus (profile-adapted strategy in second half)
+       """
        meters = {m: getattr(self, f"_{m}") for m in METERS}
 
-       # 1. Meter balance (0.30): high mean, low variance
+       # 1. Meter balance (0.25): high mean, low variance
        values = list(meters.values())
        mean_meter = sum(values) / len(values)
        variance = sum((v - mean_meter) ** 2 for v in values) / len(values)
        balance_score = max(0.0, mean_meter - variance)
 
-       # 2. No crashes (0.25): fraction of steps without critical meters
+       # 2. No crashes (0.20): fraction of steps without critical meters
        steps = max(self._timestep, 1)
        crash_free_ratio = 1.0 - (self._crash_count / (steps * len(METERS)))
 
@@ -538,16 +607,40 @@
        # 4. Connection maintained (0.15)
        connection_score = self._connection
 
-       # 5. Efficiency (0.10): normalized average reward
+       # 5. Efficiency (0.05): normalized average reward
        avg_reward = self._total_reward / steps
        efficiency_score = max(0.0, min(1.0, (avg_reward + 1.0) / 2.0))
 
+       # 6. Discovery bonus (0.15): did the agent adapt its strategy to the
+       # hidden profile in the second half of the week (steps 14-27)?
+       # This is the only component that directly rewards personality discovery.
+       second_half = self._recent_actions[14:]
+       if len(second_half) > 0:
+           profile_name = self._profile["name"]
+           if profile_name == "introvert_morning":
+               # Introvert should minimise social actions
+               social_frac = sum(1 for a in second_half if a in SOCIAL_ACTIONS) / len(second_half)
+               discovery_score = max(0.0, 1.0 - social_frac * 2.5)
+           elif profile_name == "extrovert_night_owl":
+               # Extrovert should embrace social actions
+               social_frac = sum(1 for a in second_half if a in SOCIAL_ACTIONS) / len(second_half)
+               discovery_score = min(1.0, social_frac * 2.5)
+           elif profile_name == "workaholic_stoic":
+               # Workaholic should front-load work actions
+               work_frac = sum(1 for a in second_half if a in WORK_ACTIONS) / len(second_half)
+               discovery_score = min(1.0, work_frac * 1.5)
+           else:
+               discovery_score = 0.5
+       else:
+           discovery_score = 0.5
+
        score = (
-           0.30 * balance_score
-           + 0.25 * crash_free_ratio
+           0.25 * balance_score
+           + 0.20 * crash_free_ratio
            + 0.20 * progress_score
            + 0.15 * connection_score
-           + 0.10 * efficiency_score
+           + 0.05 * efficiency_score
+           + 0.15 * discovery_score
        )
        return max(0.0, min(1.0, score))
 
@@ -557,8 +650,24 @@
        done: bool,
        active_event: Optional[str],
        reward_breakdown: Optional[Dict[str, float]] = None,
+       deltas: Optional[Dict[str, float]] = None,
+       last_action: Optional[str] = None,
    ) -> RhythmObservation:
        """Build the observation returned to the agent (hides profile)."""
+       step_records = [
+           StepRecord(
+               step=h["step"],
+               action=h["action"],
+               reward=h["reward"],
+               vitality_delta=h["vitality_delta"],
+               cognition_delta=h["cognition_delta"],
+               progress_delta=h["progress_delta"],
+               serenity_delta=h["serenity_delta"],
+               connection_delta=h["connection_delta"],
+           )
+           for h in self._step_history
+       ]
+
        return RhythmObservation(
            timestep=self._timestep,
            day=self._timestep // SLOTS_PER_DAY,
@@ -573,4 +682,13 @@
            reward_breakdown=reward_breakdown or {},
            reward=reward,
            done=done,
+           # First-class delta fields (from this step; zero on reset)
+           vitality_delta=round(deltas["vitality"], 4) if deltas else 0.0,
+           cognition_delta=round(deltas["cognition"], 4) if deltas else 0.0,
+           progress_delta=round(deltas["progress"], 4) if deltas else 0.0,
+           serenity_delta=round(deltas["serenity"], 4) if deltas else 0.0,
+           connection_delta=round(deltas["connection"], 4) if deltas else 0.0,
+           last_action=last_action,
+           # Rolling history of the last HISTORY_LENGTH completed steps
+           step_history=step_records,
        )
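
Worked example of the discovery term (formula from _grade_episode above; the
action counts are invented for illustration):

# introvert_morning, second half of the week = 14 steps (14-27)
social_frac = 4 / 14                                 # took 4 social actions, ~0.286
discovery_score = max(0.0, 1.0 - social_frac * 2.5)  # ~0.286
contribution = 0.15 * discovery_score                # ~0.043 of the final grade
# A fully adapted introvert (zero social actions in the second half) keeps
# the full 0.15; this term, plus its knock-on meter effects, accounts for the
# 0.797 vs 0.587 gap quoted in the commit message.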