"""Markdown → Lark post-AST converter.

Covers the common subset: headings, bold, italic, inline code, code blocks,
links, bullet / numbered lists, blockquotes, <at>, and horizontal rules.
Unsupported syntax falls back to plain text, preserving readability.

This is a pragmatic converter — not a full CommonMark parser — because the
Lark post AST only supports a limited vocabulary. Lines that don't match a
recognized pattern are emitted as single `text` runs with inline markers
stripped.
"""

import re
from typing import Any, Dict, List, Tuple

from ...types import Identity

# Inline marker patterns (order matters: most specific first).
# Italic is tricky because `**bold**` also matches `*...*`. We require that
# the italic delimiter is not adjacent to another `*` (or `_`).
_INLINE_PATTERNS = [
    (re.compile(r"\*\*(.+?)\*\*"), "bold"),
    (re.compile(r"__(.+?)__"), "bold"),
    (re.compile(r"(?<!\*)\*(?!\*)(?!\s)([^*\n]+?)(?<!\s)\*(?!\*)"), "italic"),
    (re.compile(r"(?<!\w)(?<!_)_(?!_)(?!\s)([^_\n]+?)(?<!\s)_(?!_)(?!\w)"), "italic"),
    (re.compile(r"`([^`\n]+)`"), "code"),
    (re.compile(r"~~(.+?)~~"), "strikethrough"),
]
_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
_AT_RE = re.compile(r"<at\s+([^>]+)>([^<]*)</at>", re.IGNORECASE)
_ATTR_RE = re.compile(r'([\w_-]+)\s*=\s*"([^"]*)"')


def _parse_inline_runs(line: str) -> List[Dict[str, Any]]:
    """Return list of text runs with style attributes for a single line."""
    if not line:
        return []

    # Collect disjoint match ranges (bold / italic / code / link / at),
    # non-overlapping, leftmost-first. Whatever is left is plain text.
    matches: List[Tuple[int, int, Dict[str, Any]]] = []

    for pat, style in _INLINE_PATTERNS:
        for m in pat.finditer(line):
            matches.append((m.start(), m.end(), {"tag": "text", "text": m.group(1), "style": [style]}))

    for m in _LINK_RE.finditer(line):
        matches.append((m.start(), m.end(), {"tag": "a", "text": m.group(1), "href": m.group(2)}))

    for m in _AT_RE.finditer(line):
        attrs = dict(_ATTR_RE.findall(m.group(1)))
        uid = attrs.get("user_id") or attrs.get("open_id") or ""
        name = m.group(2) or attrs.get("user_name") or uid
        node: Dict[str, Any] = {"tag": "at"}
        if uid:
            node["user_id"] = uid
        if name:
            node["user_name"] = name
        matches.append((m.start(), m.end(), node))

    matches.sort(key=lambda t: (t[0], t[1]))
    # Remove overlapping matches (keep the earliest; drop overlaps).
    filtered: List[Tuple[int, int, Dict[str, Any]]] = []
    cursor = 0
    for s, e, node in matches:
        if s < cursor:
            continue
        filtered.append((s, e, node))
        cursor = e

    # Interleave with plain-text runs.
    out: List[Dict[str, Any]] = []
    pos = 0
    for s, e, node in filtered:
        if s > pos:
            out.append({"tag": "text", "text": line[pos:s]})
        out.append(node)
        pos = e
    if pos < len(line):
        out.append({"tag": "text", "text": line[pos:]})
    return [r for r in out if r.get("text") or r.get("tag") == "at" or r.get("tag") == "a"]


def _plain(text: str) -> List[Dict[str, Any]]:
    return [{"tag": "text", "text": text}] if text else []


def markdown_to_post_ast(
    md: str,
    title: str = "",
    locale: str = "zh_cn",
    mentions: "list[Identity] | None" = None,
    table_mode: str = "off",
    tag_md_mode: str = "structured",
) -> Dict[str, Any]:
    """Produce a Lark post AST (`{locale: {title, content: [[...]]}}`) from Markdown.

    Mentions supplied via `mentions` are prepended (inline @tags) to the first
    paragraph so the recipient actually gets notified. If the markdown is
    empty, a mention-only paragraph is emitted instead of dropping them.

    ``tag_md_mode``:
        - ``"structured"`` (default): parse Markdown into explicit post nodes
          (``tag:text`` with style attributes, ``tag:a`` for links,
          ``tag:code_block`` for fenced code, etc). Cross-client deterministic.
        - ``"native"``: wrap the raw markdown into one or more ``tag:md`` rows
          (split at code-fence boundaries) and let the Feishu client's own
          markdown parser render natively. Renders headers/blockquotes/lists
          with native styling, but rendering depends on Feishu client version.
    """
    if tag_md_mode == "native":
        return _build_native_md_ast(md, title=title, locale=locale, mentions=mentions)
    lines = (md or "").splitlines()
    paragraphs: List[List[Dict[str, Any]]] = []

    i = 0
    n = len(lines)

    def _flush_paragraph(buf: List[str]) -> None:
        # Emit buffered prose lines as one paragraph (no-op when empty).
        if not buf:
            return
        para_text = "\n".join(buf).strip()
        if not para_text:
            return
        paragraphs.append(_parse_inline_runs(para_text))
        buf.clear()

    buf: List[str] = []
    while i < n:
        line = lines[i]
        stripped = line.strip()

        # Horizontal rule
        if re.fullmatch(r"-{3,}|\*{3,}|_{3,}", stripped or ""):
            _flush_paragraph(buf)
            paragraphs.append([{"tag": "hr"}])
            i += 1
            continue

        # ATX heading: the post AST has no heading node, so render as bold.
        m = re.match(r"^(#{1,6})\s+(.*)", line)
        if m:
            _flush_paragraph(buf)
            heading_text = m.group(2).strip()
            run = _parse_inline_runs(heading_text)
            for r in run:
                if r.get("tag") == "text":
                    styles = r.setdefault("style", [])
                    if "bold" not in styles:
                        styles.append("bold")
            paragraphs.append(run or _plain(heading_text))
            i += 1
            continue

        # Fenced code block
        m = re.match(r"^```(\w*)\s*$", line)
        if m:
            _flush_paragraph(buf)
            lang = m.group(1) or ""
            code_lines: List[str] = []
            i += 1
            while i < n and not re.match(r"^```\s*$", lines[i]):
                code_lines.append(lines[i])
                i += 1
            if i < n:
                i += 1  # skip closing fence
            paragraphs.append(
                [
                    {
                        "tag": "code_block",
                        "language": lang.upper() if lang else "TEXT",
                        "text": "\n".join(code_lines),
                    }
                ]
            )
            continue

        # Blockquote. Lark post has no native blockquote; prefix each quoted
        # line with │ so the whole quote stays visually distinguishable.
        if stripped.startswith(">"):
            _flush_paragraph(buf)
            quote_lines: List[str] = []
            while i < n and lines[i].lstrip().startswith(">"):
                quote_lines.append(re.sub(r"^\s*>\s?", "", lines[i]))
                i += 1
            quote_text = "\n".join("│ " + q for q in quote_lines)
            paragraphs.append(_parse_inline_runs(quote_text))
            continue

        # Bullet list / ordered list: emit each item as its own paragraph.
        # Bullets are normalized to "• "; ordered-list numbering is kept
        # verbatim.
        if re.match(r"^\s*[-*+]\s+", line) or re.match(r"^\s*\d+[.)]\s+", line):
            _flush_paragraph(buf)
            while i < n and (
                re.match(r"^\s*[-*+]\s+", lines[i])
                or re.match(r"^\s*\d+[.)]\s+", lines[i])
            ):
                bullet_text = re.sub(r"^\s*[-*+]\s+", "• ", lines[i])
                paragraphs.append(_parse_inline_runs(bullet_text))
                i += 1
            continue

        # Table: depend on table_mode setting.
        if _looks_like_table_row(line):
            _flush_paragraph(buf)
            table_lines: List[str] = [line]
            i += 1
            while i < n and _looks_like_table_row(lines[i]):
                table_lines.append(lines[i])
                i += 1
            paragraphs.extend(_convert_table(table_lines, mode=table_mode))
            continue

        if not stripped:
            _flush_paragraph(buf)
            i += 1
            continue

        buf.append(line)
        i += 1

    _flush_paragraph(buf)

    # Inject @mentions at the start of the first paragraph so recipients are
    # notified. A mention-only message (empty markdown) still gets one row
    # instead of silently losing the mentions.
    if mentions:
        at_runs: List[Dict[str, Any]] = []
        for ident in mentions:
            if not ident or not ident.open_id:
                continue
            at_runs.append(
                {
                    "tag": "at",
                    "user_id": ident.open_id,
                    "user_name": ident.display_name or "",
                }
            )
            at_runs.append({"tag": "text", "text": " "})
        if at_runs:
            if paragraphs:
                paragraphs[0] = at_runs + paragraphs[0]
            else:
                paragraphs = [at_runs]

    if not paragraphs:
        paragraphs = [_plain("")]

    return {locale: {"title": title or "", "content": paragraphs}}


def _looks_like_table_row(line: str) -> bool:
    stripped = (line or "").strip()
    return stripped.count("|") >= 2 and stripped.startswith("|") and stripped.endswith("|")


def _convert_table(lines: List[str], mode: str) -> List[List[Dict[str, Any]]]:
    """Convert a Markdown table into paragraphs based on `mode`."""
    if mode == "off":
        return [_parse_inline_runs(ln) for ln in lines]

    # Drop the separator row (---|---).
    rows = [
        [c.strip() for c in ln.strip().strip("|").split("|")]
        for ln in lines
        if not re.match(r"^\s*\|?[\s:\-|]+\|?\s*$", ln)
    ]
    if not rows:
        return [_parse_inline_runs(ln) for ln in lines]

    if mode == "bullets":
        # "col1: val1 · col2: val2" per row, excluding the header row.
        header = rows[0]
        out: List[List[Dict[str, Any]]] = []
        for row in rows[1:]:
            pairs = [f"{header[i] if i < len(header) else ''}: {c}" for i, c in enumerate(row)]
            out.append([{"tag": "text", "text": "• " + " · ".join(pairs)}])
        return out or [_parse_inline_runs(lines[0])]

    if mode == "code":
        src = "\n".join(lines)
        return [[{"tag": "code_block", "language": "TEXT", "text": src}]]

    # mode == "table"
    if mode == "table":
        rendered = []
        for row in rows:
            rendered.append([{"tag": "text", "text": " | ".join(row)}])
        return rendered

    # default fallback
    return [_parse_inline_runs(ln) for ln in lines]


_FENCE_LINE_RE = re.compile(r"^```")


def _build_native_md_ast(
    md: str,
    title: str = "",
    locale: str = "zh_cn",
    mentions: "list[Identity] | None" = None,
) -> Dict[str, Any]:
    """Pack raw markdown into one or more ``tag:md`` rows.

    Each fenced code block is its own row; prose between fences is its own
    row. This mirrors known Feishu client behavior (md content immediately
    after a code fence may be swallowed otherwise).

    Mentions are attached to the first row (consistent with structured mode).
    """
    segments = _split_at_code_fences(md or "")
    rows: List[List[Dict[str, Any]]] = [
        [{"tag": "md", "text": seg}] for seg in segments
    ]

    if mentions and rows:
        at_runs: List[Dict[str, Any]] = []
        for ident in mentions:
            if not ident or not ident.open_id:
                continue
            at_runs.append({
                "tag": "at",
                "user_id": ident.open_id,
                "user_name": ident.display_name or "",
            })
            at_runs.append({"tag": "text", "text": " "})
        rows[0] = at_runs + rows[0]

    return {locale: {"title": title or "", "content": rows}}


def _split_at_code_fences(text: str) -> List[str]:
    """Split markdown at fenced-code-block boundaries.

    Returns a list of non-empty segments where each segment is either a
    prose block (no fence lines) or one complete fenced code block
    (open fence + body + close fence). Unclosed fences are absorbed into
    the trailing segment as-is — no content is dropped.

    Distinct from ``splitter.split_with_code_fences``: this one is
    length-agnostic and never inserts synthetic close/reopen fences.
    """
    if not text:
        return []
    lines = text.split("\n")
    out: List[str] = []
    buf: List[str] = []
    in_fence = False

    def _flush():
        if buf:
            out.append("\n".join(buf))
            buf.clear()

    for line in lines:
        is_fence = bool(_FENCE_LINE_RE.match(line))
        if not in_fence and is_fence:
            # Closing prose, opening fence
            _flush()
            buf.append(line)
            in_fence = True
        elif in_fence and is_fence:
            # Closing fence
            buf.append(line)
            _flush()
            in_fence = False
        else:
            buf.append(line)

    _flush()
    return [s for s in out if s != ""]
