| """Streamlit visualizer for the evaluation model outputs. |
| |
| Run the following command to start the visualizer: |
| streamlit run app.py --server.port 8501 --server.address 0.0.0.0 |
| NOTE: YOU SHOULD BE AT THE ROOT OF THE REPOSITORY TO RUN THIS COMMAND. |
| |
| Mostly borrow from: https://github.com/xingyaoww/mint-bench/blob/main/scripts/visualizer.py |
| """ |
|
|
| import re |
| import os |
| import json |
| import random |
| from glob import glob |
|
|
| import altair as alt |
| import pandas as pd |
| import streamlit as st |
| |
| from utils import filter_dataframe, dataframe_with_selections, load_filepaths |
| from utils.swe_bench import load_df_from_selected_filepaths, agg_stats |
|
|
|
|
| |
| st.set_page_config( |
| layout='wide', |
| page_title='π OpenDevin SWE-Bench Output Visualizer', |
| page_icon='π' |
| ) |
| st.write('# π OpenDevin SWE-Bench Output Visualizer') |
|
|
| if __name__ == '__main__': |
|
|
| |
| filepaths = load_filepaths() |
|
|
| st.markdown('**Select file(s) to visualize**') |
| filepaths = filter_dataframe(filepaths) |
| |
| |
| col1, col2 = st.columns([0.15, 1]) |
| select_all = col1.button('Select all') |
| deselect_all = col2.button('Deselect all') |
| selected_values = st.query_params.get('filepaths', '').split(',') |
| selected_values = filepaths['filepath'].tolist() if select_all else selected_values |
| selected_values = [] if deselect_all else selected_values |
|
|
| selection = dataframe_with_selections( |
| filepaths, |
| selected_values=selected_values, |
| selected_col='filepath', |
| ) |
| st.write("Your selection:") |
| st.write(selection) |
| select_filepaths = selection['filepath'].tolist() |
| |
| st.query_params['filepaths'] = select_filepaths |
|
|
| df = load_df_from_selected_filepaths(select_filepaths) |
| st.write(f'{len(df)} rows found.') |
|
|
| |
|
|
| st.markdown('---') |
| st.markdown('## Aggregated Stats') |
| stats_df = agg_stats(df) |
| if len(stats_df) == 0: |
| st.write('No data to visualize.') |
| st.stop() |
| resolved_rate = stats_df['resolved'].sum() / len(stats_df) |
|
|
| st.markdown( |
| f'- **Resolved Rate**: **{resolved_rate:2%}** : {stats_df["resolved"].sum()} / {len(df)}\n' |
| ) |
|
|
|
|
| def plot_stats(stats_df, df): |
| st.write('### Distribution of Number of Turns (by Resolved)') |
| _stat = stats_df.groupby('resolved')['n_turns'].describe() |
| |
| _stat.loc['all'] = stats_df['n_turns'].describe() |
| st.dataframe(_stat, use_container_width=True) |
| chart = ( |
| alt.Chart(stats_df, title='Distribution of Number of Turns by Resolved') |
| .mark_bar() |
| .encode( |
| x=alt.X( |
| 'n_turns', type='quantitative', title='Number of Turns', bin={'step': 1} |
| ), |
| y=alt.Y('count()', type='quantitative', title='Count'), |
| color=alt.Color('resolved', type='nominal', title='Resolved'), |
| ) |
| .properties(width=400) |
| ) |
| st.altair_chart(chart, use_container_width=True) |
|
|
| if 'repo' in stats_df.columns: |
| st.markdown('### Count of Resolved by Repo') |
| col1, col2 = st.columns([0.3, 0.7]) |
| with col1: |
| resolved_by_repo = stats_df.groupby('repo')['resolved'].sum() |
| total_by_repo = stats_df.groupby('repo')['resolved'].count() |
| resolved_rate_by_repo = resolved_by_repo / total_by_repo |
| resolved_by_repo_df = pd.DataFrame( |
| { |
| 'Resolved': resolved_by_repo, |
| 'Total': total_by_repo, |
| 'Resolved Rate': resolved_rate_by_repo, |
| } |
| ).sort_values('Resolved Rate', ascending=False) |
| st.dataframe( |
| resolved_by_repo_df.style.format('{:.2%}', subset=['Resolved Rate']) |
| .format('{:.0f}', subset=['Resolved', 'Total']) |
| .set_caption('Count of Resolved by Repo'), |
| height=400, |
| ) |
| with col2: |
| chart = ( |
| alt.Chart( |
| resolved_by_repo_df.reset_index(), title='Count of Resolved by Repo' |
| ) |
| .mark_bar() |
| .encode( |
| x=alt.X( |
| 'Resolved Rate', |
| type='quantitative', |
| title='Resolved Rate', |
| axis=alt.Axis(format='%'), |
| scale=alt.Scale(domain=(0, 1)), |
| ), |
| y=alt.Y('repo', type='nominal', title='Repo', sort='-x'), |
| color=alt.Color( |
| 'Resolved Rate', type='quantitative', title='Resolved Rate' |
| ), |
| ) |
| .properties(height=400) |
| ) |
| st.altair_chart(chart, use_container_width=True) |
|
|
| |
| obs_lengths = [] |
| for _, entry in df.iterrows(): |
| if entry['history'] is None: |
| continue |
| for _, (_, obs) in enumerate(entry['history']): |
| if 'content' in obs: |
| obs_lengths.append(len(obs['content'])) |
| st.write('### Distribution of #char of Observation Content') |
| obs_lengths = pd.Series(obs_lengths).to_frame().rename(columns={0: 'value'}) |
| |
| |
| quantiles = [0.7, 0.8, 0.9, 0.95, 0.97, 0.99] |
| quantile_stats = obs_lengths['value'].quantile(quantiles).to_frame() |
| |
| quantile_stats.index = [f'{q*100:.0f}%' for q in quantiles] |
| |
| quantile_stats = pd.concat([obs_lengths.describe(), quantile_stats]).sort_index() |
| st.dataframe(quantile_stats.T, use_container_width=True) |
|
|
|
|
| with st.expander('See stats', expanded=True): |
| plot_stats(stats_df, df) |
|
|
| |
| st.markdown('---') |
| st.markdown('## Visualize a Row') |
| |
| if st.button('Randomly Select a Row'): |
| row_id = random.choice(stats_df['idx'].values) |
| st.query_params['row_idx'] = str(row_id) |
|
|
| if st.button('Clear Selection'): |
| st.query_params['row_idx'] = '' |
|
|
| selected_row = dataframe_with_selections( |
| stats_df, |
| list( |
| filter( |
| lambda x: x is not None, |
| map( |
| lambda x: int(x) if x else None, |
| st.query_params.get('row_idx', '').split(','), |
| ), |
| ) |
| ), |
| selected_col='idx', |
| ) |
| if len(selected_row) == 0: |
| st.write('No row selected.') |
| st.stop() |
| elif len(selected_row) > 1: |
| st.write('More than one row selected.') |
| st.stop() |
| row_id = selected_row['idx'].values[0] |
|
|
| |
| st.query_params['filepaths'] = select_filepaths |
| st.query_params['row_idx'] = str(row_id) |
|
|
| row_id = st.number_input( |
| 'Select a row to visualize', min_value=0, max_value=len(df) - 1, value=row_id |
| ) |
| row = df.iloc[row_id] |
|
|
| |
| st.write(f'Visualizing row `{row_id}`') |
| row_dict = df.iloc[row_id] |
|
|
| n_turns = len(row_dict['history']) |
| st.write(f'Number of turns: {n_turns}') |
|
|
| with st.expander('Raw JSON', expanded=False): |
| st.markdown('### Raw JSON') |
| st.json(row_dict.to_dict()) |
|
|
|
|
| def visualize_action(action): |
| if action['action'] == 'run': |
| thought = action['args'].get('thought', '') |
| if thought: |
| st.markdown(thought) |
| st.code(action['args']['command'], language='bash') |
| elif action['action'] == 'run_ipython': |
| thought = action['args'].get('thought', '') |
| if thought: |
| st.markdown(thought) |
| st.code(action['args']['code'], language='python') |
| elif action['action'] == 'talk': |
| st.markdown(action['args']['content']) |
| elif action['action'] == 'message': |
| st.markdown(action['args']['content']) |
| else: |
| st.json(action) |
|
|
|
|
| def visualize_obs(observation): |
| if 'content' in observation: |
| num_char = len(observation['content']) |
| st.markdown(rf'\# characters: {num_char}') |
| if observation['observation'] == 'run': |
| st.code(observation['content'], language='plaintext') |
| elif observation['observation'] == 'run_ipython': |
| st.code(observation['content'], language='python') |
| elif observation['observation'] == 'message': |
| st.markdown(observation['content']) |
| elif observation['observation'] == 'null': |
| st.markdown('null observation') |
| else: |
| st.json(observation) |
|
|
|
|
| def visualize_row(row_dict): |
| st.markdown('### Test Result') |
| test_result = row_dict['test_result']['result'] |
| st.write(pd.DataFrame([test_result])) |
|
|
| if row_dict['error']: |
| st.markdown('### Error') |
| st.code(row_dict['error'], language='plaintext') |
|
|
| st.markdown('### Interaction History') |
| with st.expander('Interaction History', expanded=True): |
| st.code(row_dict['instruction'], language='plaintext') |
| history = row['history'] |
| for i, (action, observation) in enumerate(history): |
| st.markdown(f'#### Turn {i + 1}') |
| st.markdown('##### Action') |
| visualize_action(action) |
| st.markdown('##### Observation') |
| visualize_obs(observation) |
|
|
| st.markdown('### Agent Patch') |
| with st.expander('Agent Patch', expanded=False): |
| st.code(row_dict['git_patch'], language='diff') |
|
|
| st.markdown('### Gold Patch') |
| with st.expander('Gold Patch', expanded=False): |
| st.code(row_dict['swe_instance']['patch'], language='diff') |
|
|
| st.markdown('### Test Output') |
| with st.expander('Test Output', expanded=False): |
| st.code(row_dict['test_result']['test_output'], language='plaintext') |
|
|
|
|
| visualize_row(row_dict) |
|
|
|
|
| def visualize_swe_instance(row_dict): |
| st.markdown('### SWE Instance') |
| swe_instance = row_dict['swe_instance'] |
| st.markdown(f'Repo: `{swe_instance["repo"]}`') |
| st.markdown(f'Instance ID: `{swe_instance["instance_id"]}`') |
| st.markdown(f'Base Commit: `{swe_instance["base_commit"]}`') |
|
|
| if 'fine_grained_report' in row_dict: |
| st.markdown('### Fine Grained Report') |
| |
| eval_report = row_dict['fine_grained_report']['eval_report'] |
| st.markdown('#### PASS_TO_PASS') |
| p2p_success = eval_report['PASS_TO_PASS']['success'] |
| p2p_fail = eval_report['PASS_TO_PASS']['failure'] |
| |
| p2p_success = pd.Series(p2p_success).to_frame('test') |
| p2p_success['success'] = True |
| p2p_fail = pd.Series(p2p_fail).to_frame('test') |
| p2p_fail['success'] = False |
| p2p = pd.concat([p2p_success, p2p_fail]) |
| st.dataframe(p2p) |
|
|
| st.markdown('#### FAIL_TO_PASS') |
| f2p_success = eval_report['FAIL_TO_PASS']['success'] |
| f2p_fail = eval_report['FAIL_TO_PASS']['failure'] |
| |
| f2p_success = pd.Series(f2p_success).to_frame('test') |
| f2p_success['success'] = True |
| f2p_fail = pd.Series(f2p_fail).to_frame('test') |
| f2p_fail['success'] = False |
| f2p = pd.concat([f2p_success, f2p_fail]) |
| st.dataframe(f2p) |
| else: |
| st.markdown('#### PASS_TO_PASS') |
| st.write(pd.Series(json.loads(swe_instance['PASS_TO_PASS']))) |
| st.markdown('#### FAIL_TO_PASS') |
| st.write(pd.Series(json.loads(swe_instance['FAIL_TO_PASS']))) |
|
|
|
|
| NAV_MD = """ |
| ## Navigation |
| - [Home](#opendevin-swe-bench-output-visualizer) |
| - [Aggregated Stats](#aggregated-stats) |
| - [Visualize a Row](#visualize-a-row) |
| - [Raw JSON](#raw-json) |
| - [Test Result](#test-result) |
| - [Interaction History](#interaction-history) |
| - [Agent Patch](#agent-patch) |
| - [Gold Patch](#gold-patch) |
| - [Test Output](#test-output) |
| """ |
|
|
| if 'swe_instance' in row_dict: |
| visualize_swe_instance(row_dict) |
| NAV_MD += ( |
| '- [SWE Instance](#swe-instance)\n' |
| ' - [PASS_TO_PASS](#pass-to-pass)\n' |
| ' - [FAIL_TO_PASS](#fail-to-pass)\n' |
| ) |
|
|
| with st.sidebar: |
| st.markdown(NAV_MD) |
|
|