
    ih,                        d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	m
Z
mZmZmZ  ej        e          Z eh d          Zd$dZd%dZd&dZd'dZd(dZd)d*dZd+dZd,d!Zd"d#gZdS )-u  Routing helpers for inbound user-attached images.

Two modes:

  native  — attach images as OpenAI-style ``image_url`` content parts on the
            user turn. Provider adapters (Anthropic, Gemini, Bedrock, Codex,
            OpenAI chat.completions) already translate these into their
            vendor-specific multimodal formats.

  text    — run ``vision_analyze`` on each image up-front and prepend the
            description to the user's text. The model never sees the pixels;
            it only sees a lossy text summary. This is the pre-existing
            behaviour and still the right choice for non-vision models.

The decision is made once per message turn by :func:`decide_image_input_mode`.
It reads ``agent.image_input_mode`` from config.yaml (``auto`` | ``native``
| ``text``, default ``auto``) and the active model's capability metadata.

In ``auto`` mode:
  - If the user has explicitly configured ``auxiliary.vision.provider``
    (i.e. not ``auto`` and not empty), we assume they want the text pipeline
    regardless of the main model — they've opted in to a specific vision
    backend for a reason (cost, quality, local-only, etc.).
  - Otherwise, if the active model reports ``supports_vision=True`` in its
    models.dev metadata, we attach natively.
  - Otherwise (non-vision model, no explicit override), we fall back to text.

This keeps ``vision_analyze`` surfaced as a tool in every session — skills
and agent flows that chain it (browser screenshots, deeper inspection of
URL-referenced images, style-gating loops) keep working. The routing only
affects *how user-attached images on the current turn* are presented to the
main model.
    )annotationsN)Path)AnyDictListOptionalTuple>   autotextnativerawr   returnstrc                    t          | t                    sdS |                                                                 }|t          v r|S dS )z5Normalize a config value into one of the valid modes.r
   )
isinstancer   striplower_VALID_MODES)r   vals     8/home/piyush/.hermes/hermes-agent/agent/image_routing.py_coerce_moder   1   sG    c3 v
))++



C
l
6    cfgOptional[Dict[str, Any]]boolc                h   t          | t                    sdS |                     d          pi }t          |t                    sdS |                    d          pi }t          |t                    sdS t          |                    d          pd                                                                          }t          |                    d          pd                                          }t          |                    d          pd                                          }|dv r|s|sdS d	S )
zTrue when the user configured a specific auxiliary vision backend.

    An explicit override means the user *wants* the text pipeline (they're
    paying for a dedicated vision model), so we don't silently bypass it.
    F	auxiliaryvisionprovider modelbase_url)r    r
   T)r   dictgetr   r   r   )r   auxr   r   r!   r"   s         r   _explicit_aux_vision_overrider&   ;   s     c4   u
''+


$"Cc4   uWWX$"Ffd## u6::j))/R006688>>@@H

7##)r**0022E6::j))/R006688H <hu4r   r   r!   Optional[bool]c                    | r|sdS 	 ddl m}  || |          }n5# t          $ r(}t                              d| ||           Y d}~dS d}~ww xY w|dS t          |j                  S )z:Return True/False if we can resolve caps, None if unknown.Nr   )get_model_capabilitiesu2   image_routing: caps lookup failed for %s:%s — %s)agent.models_devr)   	Exceptionloggerdebugr   supports_vision)r   r!   r)   capsexcs        r   _lookup_supports_visionr1   T   s     5 t;;;;;;%%h66   I8UZ\_```ttttt |t$%%%s    
AAAc                >   d}t          |t                    rN|                    d          pi }t          |t                    r"t          |                    d                    }|dk    rdS |dk    rdS t	          |          rdS t          | |          }|du rdS dS )a1  Return ``"native"`` or ``"text"`` for the given turn.

    Args:
      provider: active inference provider ID (e.g. ``"anthropic"``, ``"openrouter"``).
      model:    active model slug as it would be sent to the provider.
      cfg:      loaded config.yaml dict, or None. When None, behaves as auto.
    r
   agentimage_input_moder   r   T)r   r#   r$   r   r&   r1   )r   r!   r   mode_cfg	agent_cfgsupportss         r   decide_image_input_moder8   c   s     H#t GGGG$$*	i&& 	G#IMM2D$E$EFFH8x6v %S)) v&x77H4x6r   bytesOptional[str]c                t   | sdS |                      d          rdS |                      d          rdS | dd         dv rdS t          |           d	k    r| dd
         dk    r| dd	         dk    rdS |                      d          rdS t          |           d	k    r| d
d         dk    r| dd	         dv rdS dS )a  Detect image MIME from magic bytes. Returns None if unrecognised.

    Filename-based detection (``mimetypes.guess_type``) is unreliable when
    upstream platforms lie about content-type. Discord, for example, can
    serve a PNG with ``content_type=image/webp`` for proxied/animated
    stickers, custom emoji previews, or images uploaded via certain bots.
    Anthropic strictly validates that declared media_type matches the
    actual bytes and returns HTTP 400 on mismatch, so we sniff to be safe.
    Ns   PNG

	image/pngs   
image/jpeg   )s   GIF87as   GIF89a	image/gif      s   RIFF   s   WEBP
image/webps   BM	image/bmps   ftyp)s   heics   heixs   hevcs   hevxs   mif1s   msf1s   heims   heisz
image/heic)
startswithlenr   s    r   _sniff_mime_from_bytesrH      s      t
~~*++ {
~~o&& |
2A2w((({
3xx2~~#bqb'W,,QrTg1E1E|
~~e {
3xx2~~#ac(g--#ad) @ 3 3 |4r   pathr   Optional[bytes]c                   |t          |          }|r|S t          j        t          |                     \  }}|r|                    d          r|S | j                                        }ddddddd                    |d          S )	zReturn image MIME type for *path*.

    If *raw* bytes are provided, magic-byte sniffing wins (authoritative).
    Otherwise we fall back to ``mimetypes`` then suffix-based defaults.
    Nzimage/r=   r<   r?   rC   rD   )z.jpgz.jpegz.pngz.gifz.webpz.bmp)rH   	mimetypes
guess_typer   rE   suffixr   r$   )rI   r   sniffedmime_rN   s         r   _guess_mimerR      s     (-- 	N"3t99--GD! ))  [  F  
c&, r   c                   	 |                                  }n4# t          $ r'}t                              d| |           Y d}~dS d}~ww xY wt	          | |          }t          j        |                              d          }d| d| S )u!  Encode a local image as a base64 data URL at its native size.

    Size limits are NOT enforced here — the agent retry loop
    (``run_agent._try_shrink_image_parts_in_messages``) shrinks on the
    provider's first rejection. Keeping this simple means providers that
    accept large images (OpenAI 49 MB+, Gemini 100 MB) don't pay a silent
    quality tax just because one other provider is stricter.

    Returns None only if the file can't be read (missing, permission
    denied, etc.); the caller reports those paths in ``skipped``.
    u'   image_routing: failed to read %s — %sNrG   asciizdata:z;base64,)
read_bytesr+   r,   warningrR   base64	b64encodedecode)rI   r   r0   rP   b64s        r   _file_to_data_urlr[      s    oo   @$LLLttttt t%%%D

3


&
&w
/
/C&4&&&&&s    
AAA	user_textimage_paths	List[str]&Tuple[List[Dict[str, Any]], List[str]]c                   g }g }g }|D ]}t          |          }|                                r|                                s#|                    t	          |                     \t          |          }|s#|                    t	          |                     |                    dd|id           |                    t	          |                     | pd                                }|rI|pd}	d                    d |D                       }
|	 d|
 }d	|d
g}|                    |           ||fS g }|r|                    d	|d
           ||fS )u  Build an OpenAI-style ``content`` list for a user turn.

    Shape:
      [{"type": "text", "text": "...\n\n[Image attached at: /local/path]"},
       {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
       ...]

    The local path of each successfully attached image is appended to the
    text part as ``[Image attached at: <path>]``. The model still sees the
    pixels via the ``image_url`` part (full native vision); the path note
    just gives it a string handle so MCP/skill tools that take an image
    path or URL argument can be invoked on the same image without an
    extra round-trip. This parallels the text-mode hint produced by
    ``Runner._enrich_message_with_vision`` (``vision_analyze using image_url:
    <path>``) so behaviour is consistent across both image input modes.

    Images are attached at their native size. If a provider rejects the
    request because an image is too large (e.g. Anthropic's 5 MB per-image
    ceiling), the agent's retry loop transparently shrinks and retries
    once — see ``run_agent._try_shrink_image_parts_in_messages``.

    Returns (content_parts, skipped_paths). Skipped paths are files that
    couldn't be read from disk and are NOT advertised in the path hints.
    	image_urlurl)typera   r    zWhat do you see in this image?
c              3  "   K   | ]
}d | dV  dS )z[Image attached at: ]N ).0ps     r   	<genexpr>z-build_native_content_parts.<locals>.<genexpr>  s?       
 
,-'1'''
 
 
 
 
 
r   z

r   )rc   r   )	r   existsis_fileappendr   r[   r   joinextend)r\   r]   skippedimage_partsattached_pathsraw_pathri   data_urlr   	base_text
path_hintscombined_textpartss                r   build_native_content_partsry      s   8 G(*K "N - -NNxxzz 	 	NN3x==)))$Q'' 	NN3x==)))*
 
 	 	 	 	c(mm,,,,O""$$D  <<	YY 
 
1?
 
 
 
 

 %66*6606'N'N&O[!!!g~ E 5fd33444'>r   r8   ry   )r   r   r   r   )r   r   r   r   )r   r   r!   r   r   r'   )r   r   r!   r   r   r   r   r   )r   r9   r   r:   )N)rI   r   r   rJ   r   r   )rI   r   r   r:   )r\   r   r]   r^   r   r_   )__doc__
__future__r   rW   loggingrL   pathlibr   typingr   r   r   r   r	   	getLogger__name__r,   	frozensetr   r   r&   r1   r8   rH   rR   r[   ry   __all__rg   r   r   <module>r      sg     D # " " " " "             3 3 3 3 3 3 3 3 3 3 3 3 3 3		8	$	$ y33344      2& & & &   `       F         4' ' ' ',A A A AJ  r   