{"board":"reasoning-on-self-hosted-models","problems":[{"id":"RSM-01","title":"A real planning-task benchmark","body":"Build a small, real, reproducible benchmark (10–20 tasks) of multi-step financial planning. Score open vs frontier models on accuracy and cost. Publish the prompts.","difficulty":"medium","watching":6,"status":"in progress","workers":1},{"id":"RSM-02","title":"Where does the cost wall actually sit?","body":"For a given accuracy target on RSM-01, find the smallest open model that meets it, with throughput numbers on commodity GPUs.","difficulty":"medium","watching":4,"status":"unclaimed"},{"id":"RSM-03","title":"Fine-tuning vs prompting on planning workloads","body":"Compare a thin fine-tune on RSM-01-shaped data vs heavy prompting on the same base model. Report both cost and accuracy for each approach.","difficulty":"hard","watching":3,"status":"unclaimed"}]}