
    iA?                        d Z ddlmZ ddlZddlZddlmZ  ej        e          Z	d dZ
d!dZdZddd"dZddd#dZd$dZ eddh          Zd%dZdS )&u  Sanitize tool JSON schemas for broad LLM-backend compatibility.

Some local inference backends (notably llama.cpp's ``json-schema-to-grammar``
converter used to build GBNF tool-call parsers) are strict about what JSON
Schema shapes they accept. Schemas that OpenAI / Anthropic / most cloud
providers silently accept can make llama.cpp fail the entire request with:

    HTTP 400: Unable to generate parser for this template.
    Automatic parser generation failed: JSON schema conversion failed:
    Unrecognized schema: "object"

The failure modes we've seen in the wild:

* ``{"type": "object"}`` with no ``properties`` — rejected as a node the
  grammar generator can't constrain.
* A schema value that is the bare string ``"object"`` instead of a dict
  (malformed MCP server output, e.g. ``additionalProperties: "object"``).
* ``"type": ["string", "null"]`` array types — many converters only accept
  single-string ``type``.
* ``anyOf`` / ``oneOf`` unions whose only purpose is to permit ``null`` for
  optional fields (common Pydantic/MCP shape). Anthropic rejects these at
  the top of ``input_schema``; collapse them to the non-null branch.
* Unconstrained ``additionalProperties`` on objects with empty properties.

This module walks the final tool schema tree (after MCP-level normalization
and any per-tool dynamic rebuilds) and fixes the known-hostile constructs
in-place on a deep copy. It is intentionally conservative: it only modifies
shapes the LLM backend couldn't use anyway.
    )annotationsN)Anytools
list[dict]returnc                `    | s| S g }| D ]$}|                     t          |                     %|S )uA  Return a copy of ``tools`` with each tool's parameter schema sanitized.

    Input is an OpenAI-format tool list:
    ``[{"type": "function", "function": {"name": ..., "parameters": {...}}}]``

    The returned list is a deep copy — callers can safely mutate it without
    affecting the original registry entries.
    )append_sanitize_single_tool)r   	sanitizedtools      ;/home/piyush/.hermes/hermes-agent/tools/schema_sanitizer.pysanitize_tool_schemasr   (   sK      I 6 6.t445555    r   dictc                   t          j        |           }t          |t                    r|                    d          nd}t          |t                    s|S |                    d          }t          |t                    s
di d|d<   |S t          ||                    dd                    |d<   |d         }t          |t                    s	di d|d<   nO|                    d	          dk    rd|d	<   d
|vs(t          |                    d
          t                    si |d
<   t          |d         d          |d<   t          |d         |                    dd                    |d<   |S )z9Deep-copy and sanitize a single OpenAI-format tool entry.functionN
parametersobjecttype
propertiesname<tool>pathr   r   Tkeep_nullable_hint)copydeepcopy
isinstancer   get_sanitize_nodestrip_nullable_unions_strip_top_level_combinators)r   outfnparamstops        r   r
   r
   :   s   
-

C *3 5 5	?			4Bb$ 
VVL!!Ffd## $,B??<
%f266&(3K3KLLLB|
\
Cc4   #$,B??<776??h&&"CKs""*SWW\5J5JD*Q*Q" "C -R-=RVWWWB| 4
<rvvfh77  B| Jr   )allOfanyOfoneOfenumnotr   r   r'   r   strc                   t          | t                    s| S t          |           }t          D ]8}||v r2t                              d||           |                    |d           9|S )uK  Drop combinator keywords from the top-level of a function parameters schema.

    OpenAI's Codex backend (``chatgpt.com/backend-api/codex``) is stricter
    than the public Functions API and rejects requests with::

        Invalid schema for function 'X': schema must have type 'object' and
        not have 'oneOf'/'anyOf'/'allOf'/'enum'/'not' at the top level.

    These keywords are typically used for conditional required-fields hints
    (``allOf: [{if: ..., then: {required: [...]}}]``). Removing them at the
    top level discards the hint but does not change which argument *values*
    are valid — the tool handler always re-validates required fields.

    Only the *top* level is stripped; combinators nested inside a property's
    schema are preserved (the strict rule only applies to the outermost
    parameters object).
    zcschema_sanitizer[%s]: stripped top-level %r combinator from tool parameters (strict-backend compat)N)r    r   _TOP_LEVEL_FORBIDDEN_KEYSloggerdebugpop)r'   r   r%   keys       r   r$   r$   c   sz    $ fd## 
v,,C(  #::LL?c  
 GGCJr   Tr   schemar   r   boolc                  t          | t                    rfd| D             S t          | t                    s| S fd|                                 D             }dD ]}|                    |          }t          |t                    s-d |D             }t          |          dk    rt          |          t          |          k    rut          |d         t                    rt          |d                   ni }r|                    dd           d	D ]}||v r||vr||         ||<   t          |
          c S |S )a  Collapse ``anyOf`` / ``oneOf`` nullable unions to the non-null branch.

    MCP / Pydantic optional fields commonly arrive as::

        {"anyOf": [{"type": "string"}, {"type": "null"}], "default": null}

    Anthropic's tool input-schema validator rejects the null branch. Tool
    optionality is already represented by the parent object's ``required``
    array, so we collapse the union to the single non-null variant.

    Metadata (``title``, ``description``, ``default``, ``examples``) on the
    outer union node is carried over to the replacement variant.

    Args:
        schema: JSON-Schema fragment (dict, list, or scalar).
        keep_nullable_hint: If True, set ``nullable: true`` on the replacement
            to preserve the "this field may be None" signal for downstream
            consumers that care (e.g. runtime argument coercion that maps the
            literal string ``"null"`` to Python ``None``). Anthropic's
            validator accepts ``nullable: true`` but strict producers may
            prefer False.

    Returns:
        The schema with nullable unions collapsed. Non-union nodes are
        returned unchanged.
    c                2    g | ]}t          |           S r   r#   ).0itemr   s     r   
<listcomp>z)strip_nullable_unions.<locals>.<listcomp>   s)    fffW[%d?QRRRfffr   c                :    i | ]\  }}|t          |           S r9   r:   )r;   kvr   s      r   
<dictcomp>z)strip_nullable_unions.<locals>.<dictcomp>   s>       Aq 	
 7IJJJ  r   )r*   r+   c                n    g | ]2}t          |t                    r|                    d           dk    0|3S )r   null)r    r   r!   )r;   r<   s     r   r=   z)strip_nullable_unions.<locals>.<listcomp>   sK     
 
 
tT**
/3xx/?/?6/I/I /I/I/Ir      r   nullableT)titledescriptiondefaultexamplesr   )r    listr   itemsr!   len
setdefaultr#   )r5   r   strippedr4   variantsnon_nullreplacementmeta_keys    `      r   r#   r#      s   > &$ gffff_efffffd##    LLNN  H " ] ]<<$$(D)) 	
 
%
 
 
 x==A#h--3x=="@"@/9(1+t/L/LT$x{+++RTK! 9&&z4888K ? ?x''HK,G,G,4X,>K)(I[\\\\\\Or   nodec                   t          | t                    rQ| dv r,t                              d| |            | dk    rd| indi dS t                              d|            di dS t          | t                    rfdt          |           D             S t          | t                    s| S i }|                                 D ]\  }dk    rt          |t                    rd |D             }t          |          d	k    rAt          |d
         t                    r&|d
         |d<   d|v r|	                    dd           t          d |D             d          }|r||d<   d|d<   dv r:t          |t                    r%fd|                                D             |<   dv r6t          |t                    r||<   t          | d           |<   !dv r6t          |t                    r!fdt          |          D             |<   [dv r7t          |t          t          f          rt          j        |          n||<   t          |t          t          f          rt          | d           n||<   |                    d          dk    r-t          |                    d          t                    si |d<   |                    d          dk    rt          |                    d          t                    ro|                    d          pi fd|d         D             }|s|                    dd           n+t          |          t          |d                   k    r||d<   |S )a  Recursively sanitize a JSON-Schema fragment.

    - Replaces bare-string schema values ("object", "string", ...) with
      ``{"type": <value>}`` so downstream consumers see a dict.
    - Injects ``properties: {}`` into object-typed nodes missing it.
    - Normalizes ``type: [X, "null"]`` arrays to single ``type: X`` (keeping
      ``nullable: true`` as a hint).
    - Recurses into ``properties``, ``items``, ``additionalProperties``,
      ``anyOf``, ``oneOf``, ``allOf``, and ``$defs`` / ``definitions``.
    >   rC   arraynumberr   stringbooleanintegerzGschema_sanitizer[%s]: replacing bare-string schema %r with {'type': %r}r   r   r   zMschema_sanitizer[%s]: replacing non-schema string %r with empty object schemac           	     B    g | ]\  }}t          | d | d          S )[]r"   )r;   ir<   r   s      r   r=   z"_sanitize_node.<locals>.<listcomp>   s3    TTTDt^^q^^^44TTTr   c                    g | ]
}|d k    |S )rC    r;   ts     r   r=   z"_sanitize_node.<locals>.<listcomp>   s    888aAKKKKKr   rD   r   rC   rE   Tc              3  P   K   | ]!}t          |t                    |d k    |V  "dS )rC   Nr    r.   ra   s     r   	<genexpr>z!_sanitize_node.<locals>.<genexpr>   s5      UUA*Q2D2DUfaUUr   N>   $defsr   definitionsc                H    i | ]\  }}|t          | d  d |           S ).r]   )r;   sub_ksub_vr4   r   s      r   rA   z"_sanitize_node.<locals>.<dictcomp>   sO        E5 ~e-D-Ds-D-DU-D-DEE  r   >   rK   additionalPropertiesri   >   r)   r*   r+   c                H    g | ]\  }}t          | d  d| d          S )ri   r[   r\   r]   )r;   r^   r<   r4   r   s      r   r=   z"_sanitize_node.<locals>.<listcomp>
  sP       At t%9%9s%9%9Q%9%9%9::  r   >   r,   rI   requiredr   rn   c                F    g | ]}t          |t                    |v |S r`   rd   )r;   rpropss     r   r=   z"_sanitize_node.<locals>.<listcomp>$  s-    QQQqz!S/A/AQa5jjjjjr   )r    r.   r1   r2   rJ   	enumerater   rK   rL   rM   nextr6   r"   r   r   r!   r3   )	rS   r   r%   valuerP   	first_strvalidr4   rq   s	    `     @@r   r"   r"      s(    $ 4XXXLL$dD  
 &*X%5%5FD>>  < <  	'(,d	
 	
 	
 !333$ UTTTTIdOOTTTTdD!! Cjjll 0l 0l
U &==Zt44=885888H8}}!!j!c&B&B!&qkFU??NN:t444UUUUUW[\\I 'F"CK888Zt=T=T8    $)KKMM  CHH 555%&& B !C)%D3AAC///Jud4K4K/    (//  CHH 444 0:%$/N/NYt}U+++TYCHHAKETXZ^S_A`A`k~e__s__===fkCHH wwv("":cggl6K6KT+R+R"L
 wwv(""z#''*2E2Et'L'L"%%+QQQQC
OQQQ 	$GGJ%%%%ZZ3s:////#C
OJr   patternformattuple[list[dict], int]c                \   | s| dfS ddfd| D ]x}t          |t                    r|                    d          nd}t          |t                    r5|                    d	          }t          |t                    r |           yrt                              d
           | fS )u/  Strip ``pattern`` and ``format`` JSON Schema keywords from tool schemas.

    This is a *reactive* sanitizer invoked only when llama.cpp's
    ``json-schema-to-grammar`` converter has rejected a tool schema with an
    HTTP 400 grammar-parse error.  llama.cpp's regex engine supports only a
    small subset of ECMAScript regex (literals, ``.``, ``[...]``, ``|``,
    ``*``, ``+``, ``?``, ``{n,m}``) — it rejects escape classes like ``\d``,
    ``\w``, ``\s`` and most ``format`` values.  Cloud providers (OpenAI,
    Anthropic, OpenRouter, Gemini) accept these keywords fine and rely on
    them as prompting hints, so we keep them in the default schema and only
    strip on demand.

    The strip operates on a sibling of ``type`` (so schema keywords are
    removed) — a property literally *named* ``pattern`` (e.g. the first arg
    of the built-in ``search_files`` tool) is not affected because property
    names live in the ``properties`` dict, not as siblings of ``type``.

    Args:
        tools: OpenAI-format tool list, mutated in place for efficiency.
            Callers that need to preserve the original should deep-copy first.

    Returns:
        ``(tools, stripped_count)`` — the same list reference plus a count of
        how many ``pattern``/``format`` keywords were removed across all tools.
    r   rS   r   r   Nonec                \   t          | t                    rnd| v pd| v pd| v pd| v }t          |                                           D ]:}|r%|t          v r|                     |d            dz  ) | |                    ;d S t          | t                    r| D ]} |           d S d S )Nr   r*   r+   r)   rD   )r    r   rJ   keys_STRIP_ON_RECOVERY_KEYSr3   )rS   is_schema_noder4   r<   _walkrN   s       r   r   z'strip_pattern_and_format.<locals>._walkS  s    dD!! 	
 $t^dw$d'T/dU\`dUdNDIIKK(( ! !! c-D&D&DHHS$'''MHd3i    ! ! d## 	  d	 	 r   r   Nr   zlschema_sanitizer: stripped %d pattern/format keyword(s) from tool schemas (llama.cpp grammar-parse recovery))rS   r   r   r{   )r    r   r!   r1   info)r   r   r&   r'   r   rN   s       @@r   strip_pattern_and_formatr   4  s    4  axH      $   %/d%;%;ETXXj!!!b$ 	VVL))F&$'' f 
>	
 	
 	

 (?r   )r   r   r   r   )r   r   r   r   )r'   r   r   r.   r   r   )r5   r   r   r6   r   r   )rS   r   r   r.   r   r   )r   r   r   ry   )__doc__
__future__r   r   loggingtypingr   	getLogger__name__r1   r   r
   r0   r$   r#   r"   	frozensetr~   r   r`   r   r   <module>r      s"   < # " " " " "        		8	$	$   $# # # #L G  ?G      F  $; ; ; ; ; ;|i i i i` $)Y$9:: > > > > > >r   