| | import json |
| | import re |
| | import gradio as gr |
| |
|
| | from dotenv import load_dotenv |
| | load_dotenv() |
| |
|
| | from .gen_api_answer import ( |
| | get_atla_response |
| | ) |
| |
|
| | from .prompts import ( |
| | DEFAULT_EVAL_CRITERIA, |
| | DEFAULT_EVAL_PROMPT, |
| | DEFAULT_EVAL_PROMPT_EDITABLE |
| | ) |
| |
|
| | from .random_sample_generation import ( |
| | get_random_human_ai_pair, |
| | get_random_human_ai_ground_truth_pair, |
| | generate_ai_response |
| | ) |
| |
|
| | from common import CSS_STYLES, MAIN_TITLE, HOW_IT_WORKS |
| |
|
def parse_variables(prompt):
    """Extract the unique ``{{variable}}`` placeholder names from *prompt*.

    Args:
        prompt: Template string containing zero or more ``{{name}}`` markers.

    Returns:
        List of placeholder names, whitespace-stripped, de-duplicated,
        in order of first appearance.
    """
    # Non-greedy match so "{{a}} text {{b}}" yields two names, not one.
    raw_names = re.findall(r"{{(.*?)}}", prompt)
    # dict.fromkeys de-duplicates while preserving first-seen order (3.7+),
    # replacing the original set-side-effect-in-comprehension hack.
    return list(dict.fromkeys(name.strip() for name in raw_names))
| |
|
| |
|
def get_final_prompt(eval_prompt, variable_values):
    """Fill in an evaluation prompt template.

    Args:
        eval_prompt: Template string containing ``{{name}}`` placeholders.
        variable_values: Mapping of placeholder name -> replacement string.

    Returns:
        The template with every ``{{name}}`` occurrence substituted.
    """
    filled = eval_prompt
    for name, value in variable_values.items():
        placeholder = "{{" + name + "}}"
        filled = filled.replace(placeholder, value)
    return filled
| |
|
| |
|
def populate_random_example(request: gr.Request, compatible_mode: bool):
    """Generate a random human-AI conversation example and reset judge outputs."""
    if compatible_mode:
        # Reference mode: the sample includes a ground-truth response.
        human_msg, ai_msg, reference_msg = get_random_human_ai_ground_truth_pair()
    else:
        human_msg, ai_msg = get_random_human_ai_pair()
        reference_msg = ""

    # Order matches the caller's wired outputs:
    # [human_input, ai_response, random_btn, score, critique, ground_truth]
    updates = [
        gr.update(value=human_msg),
        gr.update(value=ai_msg),
        gr.update(value="🎲", variant="secondary"),
        gr.update(value=""),
        gr.update(value=""),
        gr.update(value=reference_msg, visible=compatible_mode),
    ]
    return updates
| |
|
| |
|
def create_arena_interface():
    """Build and return the Atla judge-evaluation Gradio interface.

    Lays out a human/AI conversation input pane (optionally with a ground-truth
    reference), a judge-model selector, an editable evaluation-criteria
    accordion, and an output pane for the judge's score and critique, then
    wires all event handlers.

    Returns:
        The assembled ``gr.Blocks`` interface (not yet launched).
    """
    with gr.Blocks(theme="default", css=CSS_STYLES) as interface:
        # Hidden holder for the full default judge prompt template.
        eval_prompt = gr.Textbox(
            value=DEFAULT_EVAL_PROMPT,
            visible=False
        )
        with gr.Row():
            # Judge model picker; mapped to an API model id in submit_and_store.
            model_selector = gr.Dropdown(
                choices=["Selene", "Selene Mini"],
                value="Selene",
                label="Choose your Atla Model",
                interactive=True
            )

        with gr.Row():
            # --- Left column: conversation inputs ---
            with gr.Column(scale=1):
                with gr.Group():
                    human_input = gr.TextArea(
                        label="👩 User Input",
                        lines=5,
                        placeholder="Enter the human message here..."
                    )
                    with gr.Row():
                        # Disabled until the user types something
                        # (enabled by the human_input.change handler below).
                        generate_btn = gr.Button(
                            "Generate AI Response",
                            size="sm",
                            interactive=False
                        )

                    ai_response = gr.TextArea(
                        label="🤖 AI Response",
                        lines=10,
                        placeholder="Enter the AI response here..."
                    )

                    # Only shown while "Use a reference response" is checked.
                    ground_truth = gr.TextArea(
                        label="🎯 Ground truth response",
                        lines=10,
                        placeholder="Enter the ground truth response here...",
                        visible=False
                    )

                with gr.Row():
                    random_btn = gr.Button("🎲", scale=2)
                    send_btn = gr.Button(
                        value="Run evaluation",
                        variant="primary",
                        size="lg",
                        scale=8
                    )

            # --- Right column: judge outputs ---
            with gr.Column(scale=1):
                gr.Markdown("## 👩⚖️ Atla Evaluation")
                with gr.Group():
                    with gr.Row():
                        score = gr.Textbox(label="Score", lines=1, interactive=False)
                    critique = gr.TextArea(label="Critique", lines=12, interactive=False)

                gr.Markdown("<br>")

        # Collapsible editor for the judge's evaluation criteria.
        with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
            gr.Markdown("<br>")
            use_reference_toggle = gr.Checkbox(
                label="Use a reference response",
                value=False
            )

            # Editor for the full default prompt; hidden by default.
            with gr.Column(visible=False) as default_prompt_editor:
                eval_prompt_editable = gr.TextArea(
                    value=DEFAULT_EVAL_PROMPT_EDITABLE,
                    label="Evaluation Criteria",
                    lines=12
                )

                with gr.Row(visible=False) as edit_buttons_row:
                    cancel_prompt_btn = gr.Button("Cancel")
                    save_prompt_btn = gr.Button("Save", variant="primary")

            # Criteria-only editor shown by default ("compatible" mode).
            with gr.Column(visible=True) as compatible_prompt_editor:
                eval_criteria_text = gr.TextArea(
                    label="Evaluation Criteria",
                    lines=12,
                    value=DEFAULT_EVAL_CRITERIA,
                    placeholder="Enter the complete evaluation criteria and scoring rubric..."
                )
                with gr.Row(visible=False) as compatible_edit_buttons_row:
                    compatible_cancel_btn = gr.Button("Cancel")
                    compatible_save_btn = gr.Button("Save", variant="primary")

        # Session state: last saved criteria text, editing flag, reference-mode flag.
        eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE)
        is_editing = gr.State(False)
        compatible_mode_state = gr.State(False)

        def update_model_names(model_a, model_b):
            # NOTE(review): not wired to any event in this block — appears unused here.
            return gr.update(value=f"*Model: {model_a}*"), gr.update(
                value=f"*Model: {model_b}*"
            )

        last_submission = gr.State({})

        def save_criteria(new_criteria, previous_criteria):
            """Persist the edited criteria and hide the Save/Cancel row."""
            return [
                gr.update(value=new_criteria),
                new_criteria,
                gr.update(visible=False)
            ]

        def cancel_criteria(previous_criteria):
            """Revert the criteria to the last saved value and hide the buttons."""
            return [
                gr.update(value=previous_criteria),
                previous_criteria,
                gr.update(visible=False)
            ]

        def show_criteria_edit_buttons(current_value, previous_value):
            # Show Save/Cancel only when the text differs from the saved value.
            return gr.update(visible=current_value != previous_value)

        compatible_save_btn.click(
            fn=save_criteria,
            inputs=[eval_criteria_text, eval_prompt_previous],
            outputs=[eval_criteria_text, eval_prompt_previous, compatible_edit_buttons_row]
        )

        compatible_cancel_btn.click(
            fn=cancel_criteria,
            inputs=[eval_prompt_previous],
            outputs=[eval_criteria_text, eval_prompt_previous, compatible_edit_buttons_row]
        )

        eval_criteria_text.change(
            fn=show_criteria_edit_buttons,
            inputs=[eval_criteria_text, eval_prompt_previous],
            outputs=compatible_edit_buttons_row
        )

        def toggle_use_reference(checked):
            """Show/hide the ground-truth box; on enable, load a fresh example.

            Returns a component->update dict, so the unchecked branch may
            update only the ground_truth component.
            """
            if checked:
                human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
                return {
                    ground_truth: gr.update(visible=True, value=ground_truth_msg),
                    human_input: gr.update(value=human_msg),
                    ai_response: gr.update(value=ai_msg),
                    score: gr.update(value=""),
                    critique: gr.update(value=""),
                    random_btn: gr.update(value="🎲", variant="secondary"),
                }
            else:
                return {
                    ground_truth: gr.update(visible=False)
                }

        use_reference_toggle.change(
            fn=toggle_use_reference,
            inputs=[use_reference_toggle],
            outputs=[
                ground_truth,
                human_input,
                ai_response,
                score,
                critique,
                random_btn,
            ]
        )

        first_game_state = gr.State(True)

        def submit_and_store(
            model_choice,
            use_reference,
            eval_criteria_text,
            human_input,
            ai_response,
            ground_truth,
        ):
            """Run one evaluation through the Atla API.

            Returns score text, critique text, and button updates that flip
            the send button into "Regenerate evaluation" mode.
            """
            # Ground truth is only sent in reference mode; otherwise None.
            prompt_data = {
                'human_input': human_input,
                'ai_response': ai_response,
                'ground_truth': ground_truth if use_reference else None,
                'eval_criteria': eval_criteria_text,
            }

            print("\n=== Debug: Prompt data being sent to Selene API ===")
            print(json.dumps(prompt_data, indent=2))
            print("============================================\n")

            # Map the UI label to the API model identifier.
            model_id = "atla-selene-mini" if model_choice == "Selene Mini" else "atla-selene"

            response = get_atla_response(
                model_name=model_id,
                prompt=prompt_data,
                max_tokens=500,
                temperature=0.01
            )

            # assumes response is a mapping with 'score' and 'critique' keys — TODO confirm
            score_text = f"{response['score']}/5"
            critique_text = f"{response['critique']}"

            return score_text, critique_text, gr.update(value="Regenerate evaluation", variant="secondary", interactive=True), gr.update(value="🎲", variant="primary")

        send_btn.click(
            fn=submit_and_store,
            inputs=[
                model_selector,
                use_reference_toggle,
                eval_criteria_text,
                human_input,
                ai_response,
                ground_truth,
            ],
            outputs=[
                score,
                critique,
                send_btn,
                random_btn,
            ],
        )

        random_btn.click(
            fn=populate_random_example,
            inputs=[use_reference_toggle],
            outputs=[
                human_input,
                ai_response,
                random_btn,
                score,
                critique,
                ground_truth,
            ]
        )

        def handle_input_change():
            """Reset UI state when inputs are changed"""
            return [
                gr.update(value="Run evaluation", variant="primary"),
                gr.update(value="🎲", variant="secondary"),
            ]

        # Editing either input reverts the buttons to their pre-evaluation state.
        human_input.change(
            fn=handle_input_change,
            inputs=[],
            outputs=[send_btn, random_btn]
        )

        ai_response.change(
            fn=handle_input_change,
            inputs=[],
            outputs=[send_btn, random_btn]
        )

        # generate_ai_response returns a tuple; only its first element is the text.
        generate_btn.click(
            fn=lambda msg: (
                generate_ai_response(msg)[0],
                gr.update(
                    value="Generate AI Response",
                    interactive=False
                )
            ),
            inputs=[human_input],
            outputs=[ai_response, generate_btn]
        )

        # Enable the generate button only when the user input is non-blank.
        human_input.change(
            fn=lambda x: gr.update(interactive=bool(x.strip())),
            inputs=[human_input],
            outputs=[generate_btn]
        )

        # Seed the form with a random (non-reference) example on page load.
        interface.load(
            fn=lambda: populate_random_example(None, False),
            inputs=[],
            outputs=[
                human_input,
                ai_response,
                random_btn,
                score,
                critique,
                ground_truth,
            ]
        )

    return interface
| |
|
if __name__ == "__main__":
    # Build the interface and start the Gradio server.
    create_arena_interface().launch()