o
    i#                     @  s  d Z ddlmZ ddlZddlZddlZddlZddlZddl	Z	ddl
mZmZmZmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZm Z  ddl!Z!ddl"Z"ddl#Z#ddl!m$Z$ ddl%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z; ddl<m=Z=m>Z>m?Z? ddl@mAZAmBZB ddlCmDZD ddlEmFZF ddlGmHZH ddlImJZJ ddlKmLZL ddlMmNZN ddlOmPZP ddlQmRZRmSZSmTZTmUZU erddlVmWZW ejXdk r	dd lYmZZZ neZZZd!Z[e"j\d"d#Z]e9d$ed%Z^	 e9d&ed%Z_	 e9d'ed%Z`	 d(Za	 d)Zb	 d*Zce)edeS Zee)edeU Zfe)eTZgG d+d, d,e&ee^e_e`f d-d.ZhG d/d0 d0e&ee^e_e`f d-d.Zied1d2G d3d4 d4ee^e_e`f ZjG d5d6 d6e&ee^e_e`f d-d7d8Zkdzd{d@dAZleG dBdC dCZm	d|d}dKdLZnd~dTdUZoe)eepeAf ZqddYdZZred[ d\dd%ZsddadbZtddedfZuddkdlZvddpdqZwddxdyZxdS )a  Dataset management for pydantic evals.

This module provides functionality for creating, loading, saving, and evaluating datasets of test cases.
Each case must have inputs, and can optionally have a name, expected output, metadata, and case-specific evaluators.

Datasets can be loaded from and saved to YAML or JSON files, and can be evaluated against
a task function to produce an evaluation report.
    )annotationsN)	AwaitableCallableMappingSequence)AsyncExitStacknullcontext)
ContextVar)	dataclassfield)iscoroutinefunction)Path)TYPE_CHECKINGAnyGenericLiteralUnioncast)	to_thread)	BaseModel
ConfigDictFieldTypeAdapterValidationErrormodel_serializer)_typing_extra)to_json)SerializationInfoSerializerFunctionWrapHandler)Progress)NotRequiredSelf	TypedDictTypeVar)get_event_loop   )get_unwrapped_function_namelogfire_spantask_group_gather)EvaluationResult	Evaluatorrun_evaluator)DEFAULT_EVALUATORS)EvaluatorContext)EvaluatorFailure)EvaluatorSpec)SpanTree)context_subtree)EvaluationReport
ReportCaseReportCaseAggregateReportCaseFailure)RetryConfig)      )ExceptionGroup)CaseDatasetincrement_eval_metricset_eval_attributezpydantic-evals)
otel_scopeInputsT)defaultOutputT	MetadataTz./test_cases.yamlz./{stem}_schema.jsonz # yaml-language-server: $schema=c                   @  sP   e Zd ZU dZdZded< ded< dZded< dZd	ed
< ee	dZ
ded< dS )
_CaseModelzBInternal model for a case, used for serialization/deserialization.N
str | Nonenamer@   inputsMetadataT | NonemetadataOutputT | Noneexpected_outputdefault_factorylist[EvaluatorSpec]
evaluators)__name__
__module____qualname____doc__rF   __annotations__rI   rK   r   listrO    rV   rV   \/var/www/html/karishye-ai-python/venv/lib/python3.10/site-packages/pydantic_evals/dataset.pyrD   V   s   
 rD   forbidextrac                   @  sL   e Zd ZU dZedddZded< dZded< ded	< eed
Z	ded< dS )_DatasetModelzEInternal model for a dataset, used for serialization/deserialization.N$schema)rA   aliasrE   json_schema_pathrF   z-list[_CaseModel[InputsT, OutputT, MetadataT]]casesrL   rN   rO   )
rP   rQ   rR   rS   r   r^   rT   rF   rU   rO   rV   rV   rV   rW   r[   `   s   
 r[   F)initc                   @  sl   e Zd ZU dZded< 	 ded< 	 dZded< 	 dZd	ed
< 	 eedZ	ded< 	 ddddddddZ
dS )r;   aL  A single row of a [`Dataset`][pydantic_evals.Dataset].

    Each case represents a single test scenario with inputs to test. A case may optionally specify a name, expected
    outputs to compare against, and arbitrary metadata.

    Cases can also have their own specific evaluators which are run in addition to dataset-level evaluators.

    Example:
    ```python
    from pydantic_evals import Case

    case = Case(
        name='Simple addition',
        inputs={'a': 1, 'b': 2},
        expected_output=3,
        metadata={'description': 'Tests basic addition'},
    )
    ```
    rE   rF   r@   rG   NrH   rI   rJ   rK   rL   ,list[Evaluator[InputsT, OutputT, MetadataT]]rO   rV   rF   rI   rK   rO   2tuple[Evaluator[InputsT, OutputT, MetadataT], ...]c                C  s&   || _ || _|| _|| _t|| _dS )a/  Initialize a new test case.

        Args:
            name: Optional name for the case. If not provided, a generic name will be assigned when added to a dataset.
            inputs: The inputs to the task being evaluated.
            metadata: Optional metadata for the case, which can be used by evaluators.
            expected_output: Optional expected output of the task, used for comparison in evaluators.
            evaluators: Tuple of evaluators specific to this case. These are in addition to any
                dataset-level evaluators.

        N)rF   rG   rI   rK   rU   rO   )selfrF   rG   rI   rK   rO   rV   rV   rW   __init__   s
   zCase.__init__)
rF   rE   rG   r@   rI   rH   rK   rJ   rO   rc   )rP   rQ   rR   rS   rT   rI   rK   r   rU   rO   re   rV   rV   rV   rW   r;   j   s"   
 r;   c                      s  e Zd ZU dZdZded< 	 ded< 	 g Zded< 	 dd	d
dd fddZ					deddddfdd Z					deddddfd!d"Z	dddd	d#dgd+d,Z
	dhdid0d1Zeejdjd3d4Ze			dkdld<d=Ze	>		dmdd?dndDdEZe		dodd?dpdHdIZe			dqdrdLdMZded	fdsdPdQZe		dodtdRdSZe		dodudTdUZeejdvdWdXZedwdZd[Zed\d]dxdbdcZ  ZS )yr<   u  A dataset of test [cases][pydantic_evals.Case].

    Datasets allow you to organize a collection of test cases and evaluate them against a task function.
    They can be loaded from and saved to YAML or JSON files, and can have dataset-level evaluators that
    apply to all cases.

    Example:
    ```python
    # Create a dataset with two test cases
    from dataclasses import dataclass

    from pydantic_evals import Case, Dataset
    from pydantic_evals.evaluators import Evaluator, EvaluatorContext


    @dataclass
    class ExactMatch(Evaluator):
        def evaluate(self, ctx: EvaluatorContext) -> bool:
            return ctx.output == ctx.expected_output

    dataset = Dataset(
        cases=[
            Case(name='test1', inputs={'text': 'Hello'}, expected_output='HELLO'),
            Case(name='test2', inputs={'text': 'World'}, expected_output='WORLD'),
        ],
        evaluators=[ExactMatch()],
    )

    # Evaluate the dataset against a task function
    async def uppercase(inputs: dict) -> str:
        return inputs['text'].upper()

    async def main():
        report = await dataset.evaluate(uppercase)
        report.print()
    '''
       Evaluation Summary: uppercase
    ┏━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓
    ┃ Case ID  ┃ Assertions ┃ Duration ┃
    ┡━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩
    │ test1    │ ✔          │     10ms │
    ├──────────┼────────────┼──────────┤
    │ test2    │ ✔          │     10ms │
    ├──────────┼────────────┼──────────┤
    │ Averages │ 100.0% ✔   │     10ms │
    └──────────┴────────────┴──────────┘
    '''
    ```
    NrE   rF   z'list[Case[InputsT, OutputT, MetadataT]]r_   ra   rO   rV   )rF   rO   +Sequence[Case[InputsT, OutputT, MetadataT]]0Sequence[Evaluator[InputsT, OutputT, MetadataT]]c                  s`   t t  }|D ]}|jdu rq|j|v rtd|j||j qt j||t|d dS )a-  Initialize a new dataset with test cases and optional evaluators.

        Args:
            name: Optional name for the dataset.
            cases: Sequence of test cases to include in the dataset.
            evaluators: Optional sequence of evaluators to apply to all cases in the dataset.
        NDuplicate case name: )rF   r_   rO   )setstrrF   
ValueErroraddsuperre   rU   )rd   rF   r_   rO   
case_namescase	__class__rV   rW   re      s   



zDataset.__init__T)	task_namerI   taskFCallable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT]max_concurrency
int | Noneprogressbool
retry_taskRetryConfig | Noneretry_evaluatorsrr   rI   dict[str, Any] | Nonereturn-EvaluationReport[InputsT, OutputT, MetadataT]c             
     s  |pt }|p
|}tj}	|rt nd|dur t|nt ddi}
|dur/||
d< t	d||jtjd|
}pEt	  rSj
d| |	dnddfdd |j }du rnd}d}n
|jd}|jd}t fddtjdD I dH }g }g }|D ]}t|tr|| q|| qt||||||d}dtji}|dur||d< |  }dur||d< |jdur|d|j |d| W d   n1 sw   Y  W d   |S W d   |S 1 sw   Y  |S )aU  Evaluates the test cases in the dataset using the given task.

        This method runs the task on each case in the dataset, applies evaluators,
        and collects results into a report. Cases are run concurrently, limited by `max_concurrency` if specified.

        Args:
            task: The task to evaluate. This should be a callable that takes the inputs of the case
                and returns the output.
            name: The name of the experiment being run, this is used to identify the experiment in the report.
                If omitted, the task_name will be used; if that is not specified, the name of the task function is used.
            max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                If None, all cases will be evaluated concurrently.
            progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
            retry_task: Optional retry configuration for the task execution.
            retry_evaluators: Optional retry configuration for evaluator execution.
            task_name: Optional override to the name of the task being executed, otherwise the name of the task
                function will be used.
            metadata: Optional dict of experiment metadata.

        Returns:
            A report containing the results of the evaluation.
        Ngen_ai.operation.name
experimentrI   evaluate {name})rF   rr   dataset_namen_caseszEvaluating )totalro   !Case[InputsT, OutputT, MetadataT]report_case_namerj   c              	     sv    4 I d H ' t | |jI d H }r"d ur"jdd |W  d   I d H  S 1 I d H s4w   Y  d S )Nr%   )advance)_run_task_and_evaluatorsrO   update)ro   r   result)limiterprogress_barr{   ry   rd   rs   task_idrV   rW   _handle_case;  s   
0z&Dataset.evaluate.<locals>._handle_case032x016xc                   s"   g | ]\}}||f fd d	qS )c                   s    | | j p	d| S )NCase rF   )ro   ir   rV   rW   <lambda>L  s    z-Dataset.evaluate.<locals>.<listcomp>.<lambda>rV   ).0r   ro   r   rV   rW   
<listcomp>K  s    z$Dataset.evaluate.<locals>.<listcomp>r%   )rF   r_   failuresexperiment_metadataspan_idtrace_idr   averagesassertion_pass_ratezlogfire.experiment.metadata)r   )ro   r   r   rj   )r&   lenr_   r   anyio	Semaphorer   r'   rF   r   add_taskcontextr   r   r(   	enumerate
isinstancer4   appendr3   r   
assertionsset_attribute)rd   rs   rF   ru   rw   ry   r{   rr   rI   total_casesextra_attributes	eval_spanr   r   r   cases_and_failuresr_   r   itemreportfull_experiment_metadatar   rV   )r   r   r   r{   ry   rd   rs   r   rW   evaluate  s~   "
	






(999zDataset.evaluatec          	      C  s"   t  | j||||||||dS )a  Evaluates the test cases in the dataset using the given task.

        This is a synchronous wrapper around [`evaluate`][pydantic_evals.Dataset.evaluate] provided for convenience.

        Args:
            task: The task to evaluate. This should be a callable that takes the inputs of the case
                and returns the output.
            name: The name of the experiment being run, this is used to identify the experiment in the report.
                If omitted, the task_name will be used; if that is not specified, the name of the task function is used.
            max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                If None, all cases will be evaluated concurrently.
            progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
            retry_task: Optional retry configuration for the task execution.
            retry_evaluators: Optional retry configuration for evaluator execution.
            task_name: Optional override to the name of the task being executed, otherwise the name of the task
                function will be used.
            metadata: Optional dict of experiment metadata.

        Returns:
            A report containing the results of the evaluation.
        )rF   ru   rw   ry   r{   rr   rI   )r$   run_until_completer   )	rd   rs   rF   ru   rw   ry   r{   rr   rI   rV   rV   rW   evaluate_synci  s   !zDataset.evaluate_syncrb   rG   r@   rH   rK   rJ   rc   Nonec                C  sN   |dd | j D v rtd|ttttf |||||d}| j | dS )ad  Adds a case to the dataset.

        This is a convenience method for creating a [`Case`][pydantic_evals.Case] and adding it to the dataset.

        Args:
            name: Optional name for the case. If not provided, a generic name will be assigned.
            inputs: The inputs to the task being evaluated.
            metadata: Optional metadata for the case, which can be used by evaluators.
            expected_output: The expected output of the task, used for comparison in evaluators.
            evaluators: Tuple of evaluators specific to this case, in addition to dataset-level evaluators.
        c                 S  s   h | ]}|j qS rV   r   )r   ro   rV   rV   rW   	<setcomp>  s    z#Dataset.add_case.<locals>.<setcomp>rh   )rF   rG   rI   rK   rO   N)r_   rk   r;   r@   rB   rC   r   )rd   rF   rG   rI   rK   rO   ro   rV   rV   rW   add_case  s   zDataset.add_case	evaluator&Evaluator[InputsT, OutputT, MetadataT]specific_casec                 C  sZ   |du r| j | dS d}| jD ]}|j|kr |j | d}q|s+td|ddS )a  Adds an evaluator to the dataset or a specific case.

        Args:
            evaluator: The evaluator to add.
            specific_case: If provided, the evaluator will only be added to the case with this name.
                If None, the evaluator will be added to all cases in the dataset.

        Raises:
            ValueError: If `specific_case` is provided but no case with that name exists in the dataset.
        NFTr   z not found in the dataset)rO   r   r_   rF   rk   )rd   r   r   addedro   rV   rV   rW   add_evaluator  s   

zDataset.add_evaluator4tuple[type[InputsT], type[OutputT], type[MetadataT]]c                 C  sb   | j D ]}t|di }t|ddpt|dd }dkr!|  S qtd|  dt tttfS )zGet the type parameters for the Dataset class.

        Returns:
            A tuple of (InputsT, OutputT, MetadataT) types.
        __pydantic_generic_metadata__argsrV   __args__r8   z/Could not determine the generic parameters for z; using `Any` for each. You should explicitly set the generic parameters via `Dataset[MyInputs, MyOutput, MyMetadata]` when serializing or deserializing.)__mro__getattrr   getwarningswarnUserWarningr   )clscrI   r   rV   rV   rW   _params  s   
$

zDataset._paramspath
Path | strfmtLiteral['yaml', 'json'] | Nonecustom_evaluator_types6Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]]r!   c              
   C  sn   t |}| ||}t | }z| j||||jdW S  ty6 } zt| d| j d| d|d}~ww )a  Load a dataset from a file.

        Args:
            path: Path to the file to load.
            fmt: Format of the file. If None, the format will be inferred from the file extension.
                Must be either 'yaml' or 'json'.
            custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
                These are additional evaluators beyond the default ones.

        Returns:
            A new Dataset instance loaded from the file.

        Raises:
            ValidationError: If the file cannot be parsed as a valid dataset.
            ValueError: If the format cannot be inferred from the file extension.
        )r   r   default_namez2 contains data that does not match the schema for z:
.N)r   
_infer_fmt	read_text	from_textstemr   rk   rP   )r   r   r   r   rawerV   rV   rW   	from_file  s   zDataset.from_fileyamlr   contentsrj   Literal['yaml', 'json']r   c                C  sB   |dkrt |}| j|||dS |  }||}| |||S )a|  Load a dataset from a string.

        Args:
            contents: The string content to parse.
            fmt: Format of the content. Must be either 'yaml' or 'json'.
            custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
                These are additional evaluators beyond the default ones.
            default_name: Default name of the dataset, to be used if not specified in the serialized contents.

        Returns:
            A new Dataset instance parsed from the string.

        Raises:
            ValidationError: If the content cannot be parsed as a valid dataset.
        r   r   )r   	safe_load	from_dict_serialization_typemodel_validate_json_from_dataset_model)r   r   r   r   r   loadeddataset_model_typedataset_modelrV   rV   rW   r     s   

zDataset.from_textdatadict[str, Any]c                C  s    |   }||}| |||S )a<  Load a dataset from a dictionary.

        Args:
            data: Dictionary representation of the dataset.
            custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
                These are additional evaluators beyond the default ones.
            default_name: Default name of the dataset, to be used if not specified in the data.

        Returns:
            A new Dataset instance created from the dictionary.

        Raises:
            ValidationError: If the dictionary cannot be converted to a valid dataset.
        )r   model_validater   )r   r   r   r   r   r   rV   rV   rW   r   (  s   
zDataset.from_dictr   *_DatasetModel[InputsT, OutputT, MetadataT]c                 C  sL  t |}g }g }g }|jD ]'}zt|d|}	W n ty. }
 z||
 W Y d}
~
qd}
~
ww ||	 q|jD ]I}g }|jD ](}z	t||j|}W n tya }
 z||
 W Y d}
~
q?d}
~
ww || q?ttt	t
f |j|j|j|jd}||_|| q8|rtt| d|dd | |j|d}|jdu r||_||_|S )a  Create a Dataset from a _DatasetModel.

        Args:
            dataset_model: The _DatasetModel to convert.
            custom_evaluator_types: Custom evaluator classes to register for deserialization.
            default_name: Default name of the dataset, to be used if the value is `None` in the provided model.

        Returns:
            A new Dataset instance created from the _DatasetModel.
        N)rF   rG   rI   rK   z* error(s) loading evaluators from registryr8   )rF   r_   )_get_registryrO   _load_evaluator_from_registryrk   r   r_   rF   r;   r@   rB   rC   rG   rI   rK   r:   r   )r   r   r   r   registryr_   errorsdataset_evaluatorsspecdataset_evaluatorr   rowrO   r   r   rV   rV   rW   r   B  sN   





zDataset._from_dataset_modelschema_pathPath | str | Nonec                 C  s
  t |}| ||}d}|durCt|trt |j|jd}| s,t|}|j| }n||r9tt	||}nt|}| 
|| ddi}|dkrp| jdd|d}tj|dd	}|rit | }	|	 d
| }|| dS ||d< | jdd|d}
||
d
  dS )a  Save the dataset to a file.

        Args:
            path: Path to save the dataset to.
            fmt: Format to use. If None, the format will be inferred from the file extension.
                Must be either 'yaml' or 'json'.
            schema_path: Path to save the JSON schema to. If None, no schema will be saved.
                Can be a string template with {stem} which will be replaced with the dataset filename stem.
            custom_evaluator_types: Custom evaluator classes to include in the schema.
        N)r   use_short_formTr   json)modeby_aliasr   F)	sort_keys
r\      )indentr   r   )r   r   r   rj   formatr   is_absoluteparentis_relative_to_get_relative_path_reference_save_schema
model_dumpr   dump_YAML_SCHEMA_LINE_PREFIX
write_textmodel_dump_json)rd   r   r   r   r   
schema_refr   dumped_datacontentyaml_language_server_line	json_datarV   rV   rW   to_filey  s0   

zDataset.to_filec                   s  t |}g  | D ]\}t|}|dd i }t|j D ]"}|	|j
t |j|jur=t||j
  ||j
< q#||j
 ||j
< q#dfdd	}t|d
ksU|s\ t  t|dkrr| \} |d|i nt|dkr| \} |d|i t|dkr|d|}	 |d|	i q
|  \}
}}G  fdddtdd}G  fdddtdd}| }ddi|d d< |S )aj  Generate a JSON schema for this dataset type, including evaluator details.

        This is useful for generating a schema that can be used to validate YAML-format dataset files.

        Args:
            custom_evaluator_types: Custom evaluator classes to include in the schema.

        Returns:
            A dictionary representing the JSON schema.
        r}   Ncls_name_prefixrj   fieldsr   r   c                   s*   t |  d  |}tddd}||_|S )N_rX   TrZ   arbitrary_types_allowed)r"   r   __pydantic_config__)r	  r
  tdconfigr   rV   rW   _make_typed_dict  s   zCDataset.model_json_schema_with_evaluators.<locals>._make_typed_dictr   r%   short_evaluatorevaluator_paramsr   c                      sN   e Zd ZU dZded< ded< dZded< dZded	<  r%g Zd
ed< dS dS )z7Dataset.model_json_schema_with_evaluators.<locals>.CaseNrE   rF   in_typerG   zmeta_type | NonerI   zout_type | NonerK   *list[Union[tuple(evaluator_schema_types)]]rO   )rP   rQ   rR   rF   rT   rI   rK   rO   rV   evaluator_schema_typesrV   rW   r;     s   
 r;   rX   rY   c                      s6   e Zd ZU dZded< ded<  rg Zded< dS dS )z:Dataset.model_json_schema_with_evaluators.<locals>.DatasetNrE   rF   z
list[Case]r_   r  rO   )rP   rQ   rR   rF   rT   rO   rV   r  rV   rW   r<     s   
 r<   typestring
propertiesr\   )r	  rj   r
  r   r}   r   )r   itemsr   get_function_type_hintspopinspect	signature
parametersvalues
setdefaultrF   r   rA   emptyr    r   r   r   r   r   model_json_schema)r   r   r   evaluator_class
type_hintsrequired_type_hintspr  type_hint_type	params_tdr  out_type	meta_typer;   r<   json_schemarV   )r  rF   rW   !model_json_schema_with_evaluators  s<   



z)Dataset.model_json_schema_with_evaluatorsc                 C  sL   t |}| |}t|dd d }| r| |kr$|| dS dS )zSave the JSON schema for this dataset type to a file.

        Args:
            path: Path to save the schema to.
            custom_evaluator_types: Custom evaluator classes to include in the schema.
        r   )r   r   N)r   r.  r   decodeexistsr   r  )r   r   r   r-  schema_contentrV   rV   rW   r     s   

zDataset._save_schema0type[_DatasetModel[InputsT, OutputT, MetadataT]]c                 C  s   |   \}}}t|||f S )zGet the serialization type for this dataset class.

        Returns:
            A _DatasetModel type with the same generic parameters as this Dataset class.
        )r   r[   )r   
input_typeoutput_typemetadata_typerV   rV   rW   r     s   zDataset._serialization_typer   c                 C  s@   |dur|S |j  }|dv rdS |dkrdS td|jd)ah  Infer the format to use for a file based on its extension.

        Args:
            path: The path to infer the format for.
            fmt: The explicitly provided format, if any.

        Returns:
            The inferred format ('yaml' or 'json').

        Raises:
            ValueError: If the format cannot be inferred from the file extension.
        N>   .yml.yamlr   z.jsonr   z$Could not infer format for filename z/. Use the `fmt` argument to specify the format.)suffixlowerrk   rF   )r   r   r   r8  rV   rV   rW   r     s   
zDataset._infer_fmtwrap)r   nxtr   infor   c                 C  sH   t tttf dB |j}t|tr |d }r d|i|| B S || S )zAdd the JSON schema path to the serialized output.

        See <https://github.com/json-schema-org/json-schema-spec/issues/828> for context, that seems to be the nearest
        there is to a spec for this.
        Nr\   )r   dictrj   r   r   r   r   )rd   r;  r<  r   schemarV   rV   rW   _add_json_schema)  s   zDataset._add_json_schema)rF   rE   r_   rf   rO   rg   )NNTNN)rs   rt   rF   rE   ru   rv   rw   rx   ry   rz   r{   rz   rr   rE   rI   r|   r}   r~   )rF   rE   rG   r@   rI   rH   rK   rJ   rO   rc   r}   r   N)r   r   r   rE   r}   r   )r}   r   )NrV   )r   r   r   r   r   r   r}   r!   )r   rV   )
r   rj   r   r   r   r   r   rE   r}   r!   )rV   )r   r   r   r   r   rE   r}   r!   )rV   N)r   r   r   r   r   rE   r}   r!   )r   r   r   r   r   r   r   r   )r   r   r}   r   )r   r   r   r   )r}   r2  )r   r   r   r   r}   r   )r;  r   r<  r   r}   r   )rP   rQ   rR   rS   rF   rT   rO   re   r   r   r   r   classmethod	functoolscacher   r   r   r   r   DEFAULT_SCHEMA_PATH_TEMPLATEr  r.  r   r   r   r   r?  __classcell__rV   rV   rp   rW   r<      s   
 2 	j	1#9/L	r<   Tr   targetr   source_prefixrj   r}   c                 C  sV   |   s|  } zt| t| | W S  ty*   t| |j| dd Y S w )at  Get a relative path reference from source to target.

    Recursively resolve a relative path to target from source, adding '..' as needed.
    This is useful for creating a relative path reference from a source file to a target file.

    Args:
        target: The target path to reference.
        source: The source path to reference from.
        _prefix: Internal prefix used during recursion.

    Returns:
        A Path object representing the relative path from source to target.

    Example:
        If source is '/a/b/c.py' and target is '/a/d/e.py', the relative path reference
        would be '../../d/e.py'.
    z../)rI  )r   resolver   relative_tork   r   r   )rG  rH  rI  rV   rV   rW   r   7  s   r   c                   @  sX   e Zd ZU dZededZded< ededZded< dddZ	dddZ
dddZdS )_TaskRunz>Internal class to track metrics and attributes for a task run.F)r`   rM   r   
attributeszdict[str, int | float]metricsrF   rj   valueint | floatr}   r   c                 C     || j |< dS )zRecord a metric value.

        Args:
            name: The name of the metric.
            value: The value of the metric.
        N)rN  rd   rF   rO  rV   rV   rW   record_metric\     z_TaskRun.record_metricamountc                 C  s:   | j |d}|| }|dkr|dkrdS | || dS )zIncrement a metric value.

        Args:
            name: The name of the metric.
            amount: The amount to increment by.

        Note:
            If the current value is 0 and the increment amount is 0, no metric will be recorded.
        r   N)rN  r   rS  )rd   rF   rU  current_valueincremented_valuerV   rV   rW   increment_metrice  s
   
z_TaskRun.increment_metricr   c                 C  rQ  )zRecord an attribute value.

        Args:
            name: The name of the attribute.
            value: The value of the attribute.
        N)rM  rR  rV   rV   rW   record_attributeu  rT  z_TaskRun.record_attributeN)rF   rj   rO  rP  r}   r   rF   rj   rU  rP  r}   r   rF   rj   rO  r   r}   r   )rP   rQ   rR   rS   r   r=  rM  rT   rN  rS  rX  rY  rV   rV   rV   rW   rL  U  s   
 

	rL  rs   1Callable[[InputsT], Awaitable[OutputT] | OutputT]ro   r   retryrz   -EvaluatorContext[InputsT, OutputT, MetadataT]c                   s   fdd}|rddl m} |di ||}| I dH \}}}}t|trv|D ]K}	|	j D ]C\}
}|
dkrD|dkrD|dd	 q1t|ttB sLq1|
d
krW|d| q1|
	drf||

d| q1|
	drt||

d| q1q*ttttf  j j j j||||j|jd	S )an  Run a task on a case and return the context for evaluators.

    Args:
        task: The task to run.
        case: The case to run the task on.
        retry: The retry config to use.

    Returns:
        An EvaluatorContext containing the inputs, actual output, expected output, and metadata.

    Raises:
        Exception: Any exception raised by the task.
    c               	     s   t  } t d urtdt| }zctdtdB}t /}t	 }t
r5tt jI d H }nttt jI d H }t	 | }W d    n1 sRw   Y  W d    n1 saw   Y  t||}| |||fW t| S t| w )NzCA task run has already been entered. Task runs should not be nestedzexecute {task})rs   )rL  _CURRENT_TASK_RUNr   RuntimeErrorri   r'   r&   r2   timeperf_counterr   r   rB   rG   r   run_sync_get_span_durationreset)	task_run_token	task_span
span_tree_t0task_output_fallback_duration	duration_ro   rs   rV   rW   	_run_once  s.   
 

z_run_task.<locals>._run_oncer   )r]  Nr   chatrequestsr%   zoperation.costcostzgen_ai.usage.details.zgen_ai.usage.)	rF   rG   rI   rK   outputduration
_span_treerM  rN  rV   )pydantic_ai.retriesr]  r   r1   rM  r  rX  intfloat
startswithremoveprefixr.   r@   rB   rC   rF   rG   rI   rK   rN  )rs   ro   r]  ro  tenacity_retrytask_runtask_outputrt  	span_treenodekvrV   rn  rW   	_run_task  s@   


r  rt   r   r   ra   ry   r{   XReportCase[InputsT, OutputT, MetadataT] | ReportCaseFailure[InputsT, OutputT, MetadataT]c                   s  d}d}zt dt| ||j|j|jd}|j}	|	dur'|	jd}|	jd}t }
t	| ||I dH |
dj |
dj |
dj |
d	j |j| }g }g }|rt fd
d|D I dH }|D ]}t|try|| ql|| qlt|\}}}|
dt| |
dt| |
dt| W d   n1 sw   Y  t |
 }ttttf ||j|j|jjjj|||jt|||||dW S  ty	 } z$ttttf ||j|j|jt|j  d| t!" ||dW  Y d}~S d}~ww )a  Run a task on a case and evaluate the results.

    Args:
        task: The task to run.
        case: The case to run the task on.
        report_case_name: The name to use for this case in the report.
        dataset_evaluators: Evaluators from the dataset to apply to this case.
        retry_task: The retry config to use for running the task.
        retry_evaluators: The retry config to use for running the evaluators.

    Returns:
        A ReportCase containing the evaluation results.
    Nzcase: {case_name})rr   	case_namerG   rI   rK   r   r   rs  task_durationrN  rM  c                   s   g | ]}|f fd d	qS )c                   s   t |  S r@  r+   )evr{   scoring_contextrV   rW   r     s    z5_run_task_and_evaluators.<locals>.<listcomp>.<lambda>rV   )r   r  r  rV   rW   r     s    z,_run_task_and_evaluators.<locals>.<listcomp>r   scoreslabels)rF   rG   rI   rK   rs  rN  rM  r  r  r   r  total_durationr   r   evaluator_failures: )rF   rG   rI   rK   error_messageerror_stacktracer   r   )#r'   r&   rG   rI   rK   r   r   r   ra  r  r   rs  rt  rN  rM  rO   r(   r   r/   r   extend _group_evaluator_outputs_by_type_evaluation_results_adapterdump_pythonr4   r@   rB   rC   rd  	Exceptionr6   r  rP   	traceback
format_exc)rs   ro   r   r   ry   r{   r   r   	case_spanr   rj  rO   evaluator_outputsr  evaluator_outputs_by_taskoutputsr   r  r  rl  excrV   r  rW   r     s   




&r   evaluation_resultsSequence[EvaluationResult]ttuple[dict[str, EvaluationResult[bool]], dict[str, EvaluationResult[int | float]], dict[str, EvaluationResult[str]]]c                 C  s   i }i }i }t t  }| D ]Q}|j}||v r5d}| d| |v r.|d7 }| d| |v s!| d| }|| |t }rF|||< q|tt }	rS|	||< q|t }
r^|
||< q|||fS )a8  Group evaluator outputs by their result type.

    Args:
        evaluation_results: Sequence of evaluation results to group.

    Returns:
        A tuple of dictionaries mapping evaluator names to their results, grouped by result type:
        (success_evaluations, metric_evaluations, string_evaluations)
    r   r  r%   )ri   rj   rF   rl   downcastrx   rw  rx  )r  r   r  r  
seen_nameserrF   r8  	assertionscorelabelrV   rV   rW   r  0  s*   




r  z_TaskRun | Noner_  rF   rO  r   r   c                 C  $   t  }|dur|| | dS dS )zSet an attribute on the current task run.

    Args:
        name: The name of the attribute.
        value: The value of the attribute.
    N)r_  r   rY  )rF   rO  current_caserV   rV   rW   r>   Y     r>   rU  rP  c                 C  r  )zIncrement a metric on the current task run.

    Args:
        name: The name of the metric.
        amount: The amount to increment by.
    N)r_  r   rX  )rF   rU  r  rV   rV   rW   r=   e  r  r=   spanlogfire_api.LogfireSpanfallbackrx  c              	   C  s.   z	| j | j d W S  ttfy   | Y S w )a0  Calculate the duration of a span in seconds.

    We prefer to obtain the duration from a span for the sake of consistency with observability and to make
    the values more reliable during testing. However, if the span is not available (e.g. when using logfire_api
    without logfire installed), we fall back to the provided duration.

    Args:
        span: The span to calculate the duration for.
        fallback: The fallback duration to use if unable to obtain the duration from the span.

    Returns:
        The duration of the span in seconds.
    i ʚ;)end_time
start_timeAttributeError	TypeError)r  r  rV   rV   rW   rd  q  s
   rd  r   r   :Mapping[str, type[Evaluator[InputsT, OutputT, MetadataT]]]c                 C  s   i }| D ]/}t |tstd| dd|jvr td| d| }||v r/td||||< qtD ]
}|| | q6|S )zCreate a registry of evaluator types from default and custom evaluators.

    Args:
        custom_evaluator_types: Additional evaluator classes to include in the registry.

    Returns:
        A mapping from evaluator names to evaluator classes.
    zBAll custom evaluator classes must be subclasses of Evaluator, but z is not__dataclass_fields__zFAll custom evaluator classes must be decorated with `@dataclass`, but z Duplicate evaluator class name: )
issubclassr*   rk   __dict__get_serialization_namer-   r"  )r   r   r%  rF   rV   rV   rW   r     s"   




r   r   r  rE   r   r0   r   c              
   C  s   |  |j}|du rtd|jdt|   dz
||ji |jW S  tyJ } z|dur5d|nd}td|jd| d	| |d}~ww )
a  Load an evaluator from the registry based on a specification.

    Args:
        registry: Mapping from evaluator names to evaluator classes.
        case_name: Name of the case this evaluator will be used for, or None for dataset-level evaluators.
        spec: Specification of the evaluator to load.

    Returns:
        An initialized evaluator instance.

    Raises:
        ValueError: If the evaluator name is not found in the registry.
    Nz
Evaluator zA is not in the provided `custom_evaluator_types`. Valid choices: zr. If you are trying to use a custom evaluator, you must include its type in the `custom_evaluator_types` argument.zcase datasetz Failed to instantiate evaluator z for r  )r   rF   rk   rU   keysr   kwargsr  )r   r  r   r%  r   case_detailrV   rV   rW   r     s   r   )rF  )rG  r   rH  r   rI  rj   r}   r   r@  )rs   r\  ro   r   r]  rz   r}   r^  )rs   rt   ro   r   r   rj   r   ra   ry   rz   r{   rz   r}   r  )r  r  r}   r  r[  rZ  )r  r  r  rx  r}   rx  )r   r   r}   r  )r   r  r  rE   r   r0   r}   r   )yrS   
__future__r   _annotationsrB  r  sysra  r  r   collections.abcr   r   r   r   
contextlibr   r   contextvarsr	   dataclassesr
   r   r   pathlibr   typingr   r   r   r   r   r   r   logfire_apir   r   pydanticr   r   r   r   r   r   pydantic._internalr   pydantic_corer   pydantic_core.core_schemar   r   rich.progressr   typing_extensionsr    r!   r"   r#   pydantic_evals._utilsr$   _utilsr&   r'   r(   rO   r)   r*   evaluators._run_evaluatorr,   evaluators.commonr-   evaluators.contextr.   evaluators.evaluatorr/   evaluators.specr0   otelr1   otel._context_subtreer2   	reportingr3   r4   r5   r6   rv  r7   version_infoexceptiongroupr:   __all__Logfire_logfirer@   rB   rC   DEFAULT_DATASET_PATHrD  r   rU   _REPORT_CASES_ADAPTER_REPORT_CASE_FAILURES_ADAPTER_REPORT_CASE_AGGREGATE_ADAPTERrD   r[   r;   r<   r   rL  r  r   rj   r  r  r_  r>   r=   rd  r   r   rV   rV   rV   rW   <module>   s    	    

"@     ,
P^
&



"