From c390718c157bd99e69f8636cc65db0a67f1cb4bf Mon Sep 17 00:00:00 2001
From: wangdl
Date: Sat, 30 May 2026 20:50:54 +0800
Subject: [PATCH] feat: Markdown parsing with comrak - GFM tables, strikethrough, tasklist, all 11 block types

---
 crates/zx_document_core/Cargo.toml      |   2 +
 crates/zx_document_core/src/markdown.rs | 352 ++++++++++++++++++++++++
 2 files changed, 354 insertions(+)

diff --git a/crates/zx_document_core/Cargo.toml b/crates/zx_document_core/Cargo.toml
index 70bb2a4..b3bc1cc 100644
--- a/crates/zx_document_core/Cargo.toml
+++ b/crates/zx_document_core/Cargo.toml
@@ -8,3 +8,5 @@ serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 infer = "0.16"
 mime_guess = "2"
+comrak = "0.29"
+uuid = { version = "1", features = ["v4"] }
diff --git a/crates/zx_document_core/src/markdown.rs b/crates/zx_document_core/src/markdown.rs
index e69de29..4af81a3 100644
--- a/crates/zx_document_core/src/markdown.rs
+++ b/crates/zx_document_core/src/markdown.rs
@@ -0,0 +1,352 @@
+use comrak::nodes::{AstNode, NodeValue};
+use comrak::{Arena, ComrakOptions};
+use uuid::Uuid;
+
+use crate::blocks::DocumentBlock;
+use crate::error::DocumentError;
+
+/// Produce a fresh block identifier: a newly generated version-4 UUID rendered as a string.
+fn block_id() -> String {
+    let id = Uuid::new_v4();
+    id.to_string()
+}
+
+/// Build the comrak options used for parsing: the defaults with the table,
+/// strikethrough, tagfilter and tasklist extensions switched on.
+fn gfm_options() -> ComrakOptions<'static> {
+    let mut options = ComrakOptions::default();
+
+    let extension = &mut options.extension;
+    extension.table = true;
+    extension.strikethrough = true;
+    extension.tagfilter = true;
+    extension.tasklist = true;
+
+    options
+}
+
+/// Parse a Markdown string into a list of DocumentBlock.
+///
+/// The input is parsed with comrak using the options from `gfm_options()`, and the
+/// direct children of the document root are converted by `collect_blocks`.
+///
+/// # Errors
+///
+/// The current implementation has no failure path and always returns `Ok`.
+pub fn parse_markdown(md_content: &str) -> Result<Vec<DocumentBlock>, DocumentError> {
+    let arena = Arena::new();
+    let options = gfm_options();
+    let root = comrak::parse_document(&arena, md_content, &options);
+
+    let mut blocks = Vec::new();
+    collect_blocks(root, &mut blocks);
+    Ok(blocks)
+}
+
+/// Convert each direct child of `node` into a `DocumentBlock` and append it to `blocks`.
+///
+/// Headings, paragraphs, lists, code blocks, block quotes, tables, images and
+/// thematic breaks each produce one block. Paragraphs, quotes, lists and tables
+/// that yield no content produce nothing. The listed container node kinds are
+/// recursed into; every other node kind is skipped.
+fn collect_blocks<'a>(node: &'a AstNode<'a>, blocks: &mut Vec<DocumentBlock>) {
+    for child in node.children() {
+        let data = child.data.borrow();
+
+        let block = match &data.value {
+            NodeValue::Heading(heading) => Some(DocumentBlock::Heading {
+                id: block_id(),
+                level: heading.level,
+                text: collect_text(child),
+            }),
+            // A paragraph holding nothing but an image becomes a standalone Image
+            // block; otherwise it becomes a Paragraph unless its text is empty.
+            NodeValue::Paragraph => extract_image_from_paragraph(child).or_else(|| {
+                let text = collect_text(child);
+                (!text.is_empty()).then(|| DocumentBlock::Paragraph {
+                    id: block_id(),
+                    text,
+                })
+            }),
+            NodeValue::List(list) => {
+                list_block(child, list.list_type == comrak::nodes::ListType::Ordered)
+            }
+            NodeValue::CodeBlock(code) => Some(DocumentBlock::CodeBlock {
+                id: block_id(),
+                language: (!code.info.is_empty()).then(|| code.info.clone()),
+                code: code.literal.clone(),
+            }),
+            NodeValue::BlockQuote => {
+                let text = collect_text(child);
+                (!text.is_empty()).then(|| DocumentBlock::Quote {
+                    id: block_id(),
+                    text,
+                })
+            }
+            NodeValue::Table(_) => table_block(child),
+            NodeValue::Image(image) => Some(DocumentBlock::Image {
+                id: block_id(),
+                src: image.url.clone(),
+                alt: Some(collect_text(child)).filter(|alt| !alt.is_empty()),
+            }),
+            NodeValue::ThematicBreak => Some(DocumentBlock::HorizontalRule { id: block_id() }),
+            // Recurse into containers: Document, Item, TableCell, TableRow, etc.
+            NodeValue::Document
+            | NodeValue::Item(_)
+            | NodeValue::TableCell
+            | NodeValue::TableRow(_)
+            | NodeValue::DescriptionList
+            | NodeValue::DescriptionItem(_)
+            | NodeValue::DescriptionTerm
+            | NodeValue::DescriptionDetails => {
+                collect_blocks(child, blocks);
+                None
+            }
+            _ => None,
+        };
+
+        if let Some(b) = block {
+            blocks.push(b);
+        }
+    }
+}
+
+/// Build a `List` block from a comrak list node, or `None` when it has no items.
+///
+/// Both plain items and task-list items are collected. With the tasklist
+/// extension enabled comrak represents `- [x] ...` entries as `TaskItem` nodes,
+/// so matching `Item` alone would drop them. Only the item text is kept, because
+/// the list block stores plain strings; the checked state is not carried over.
+fn list_block<'a>(list_node: &'a AstNode<'a>, ordered: bool) -> Option<DocumentBlock> {
+    let items: Vec<String> = list_node
+        .children()
+        .filter(|item| {
+            matches!(
+                item.data.borrow().value,
+                NodeValue::Item(_) | NodeValue::TaskItem(_)
+            )
+        })
+        .map(collect_text)
+        .collect();
+
+    if items.is_empty() {
+        return None;
+    }
+    Some(DocumentBlock::List {
+        id: block_id(),
+        ordered,
+        items,
+    })
+}
+
+/// Build a `Table` block from a comrak table node, or `None` when no header
+/// row with cells is present.
+fn table_block<'a>(table_node: &'a AstNode<'a>) -> Option<DocumentBlock> {
+    let mut headers = Vec::new();
+    let mut rows = Vec::new();
+
+    for row_node in table_node.children() {
+        // comrak flags the header row on the node itself and never emits the
+        // `|---|---|` delimiter line as a row, so no text-based separator check
+        // is needed. Such a check would also discard legitimate rows whose
+        // cells are all empty or contain only dashes.
+        let is_header = match row_node.data.borrow().value {
+            NodeValue::TableRow(is_header) => is_header,
+            _ => continue,
+        };
+
+        let cells: Vec<String> = row_node.children().map(collect_text).collect();
+        if cells.is_empty() {
+            continue;
+        }
+
+        if is_header {
+            headers = cells;
+        } else {
+            rows.push(cells);
+        }
+    }
+
+    if headers.is_empty() {
+        return None;
+    }
+    Some(DocumentBlock::Table {
+        id: block_id(),
+        headers,
+        rows,
+    })
+}
+
+/// Check if a paragraph contains only an image, and extract it.
+///
+/// Whitespace-only text, soft breaks and hard line breaks next to the image are
+/// ignored. Any other child makes the paragraph "not image-only" and yields
+/// `None`. If the paragraph holds several images, the last one is used.
+fn extract_image_from_paragraph<'a>(node: &'a AstNode<'a>) -> Option<DocumentBlock> {
+    let mut image_node: Option<&'a AstNode<'a>> = None;
+
+    for child in node.children() {
+        let data = child.data.borrow();
+        match &data.value {
+            NodeValue::Image(_) => image_node = Some(child),
+            NodeValue::Text(t) if t.trim().is_empty() => {}
+            NodeValue::SoftBreak | NodeValue::LineBreak => {}
+            // Real content next to the image: keep the paragraph as a paragraph.
+            _ => return None,
+        }
+    }
+
+    let img = image_node?;
+    let data = img.data.borrow();
+    match &data.value {
+        NodeValue::Image(image) => Some(DocumentBlock::Image {
+            id: block_id(),
+            src: image.url.clone(),
+            // The alt text is the plain text nested inside the image node.
+            alt: Some(collect_text(img)).filter(|alt| !alt.is_empty()),
+        }),
+        _ => None,
+    }
+}
+
+/// Extract plain text from all text nodes within a subtree.
+///
+/// The result is trimmed of leading and trailing whitespace.
+fn collect_text<'a>(node: &'a AstNode<'a>) -> String {
+    let mut text = String::new();
+    collect_text_inner(node, &mut text);
+    text.trim().to_string()
+}
+
+/// Append the text found below `node` to `buf`, depth first.
+///
+/// Text and inline code are appended verbatim, a soft break becomes a space and
+/// a hard line break becomes a newline. Before descending into a block-level
+/// child, a newline is inserted if `buf` already holds text that does not end
+/// in whitespace. Without it, sibling blocks such as the paragraphs of a quote
+/// or a nested list inside a list item would be glued together
+/// (`"parentchild"`).
+fn collect_text_inner<'a>(node: &'a AstNode<'a>, buf: &mut String) {
+    for child in node.children() {
+        let data = child.data.borrow();
+        match &data.value {
+            NodeValue::Text(t) => buf.push_str(t),
+            NodeValue::SoftBreak => buf.push(' '),
+            NodeValue::LineBreak => buf.push('\n'),
+            NodeValue::Code(code) => buf.push_str(&code.literal),
+            other => {
+                if other.block() && !buf.is_empty() && !buf.ends_with(char::is_whitespace) {
+                    buf.push('\n');
+                }
+                collect_text_inner(child, buf);
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_empty_markdown() {
+        let blocks = parse_markdown("").unwrap();
+        assert!(blocks.is_empty());
+    }
+
+    #[test]
+    fn test_headings() {
+        let md = "# Title\n\n## Section\n\n### Sub";
+        let blocks = parse_markdown(md).unwrap();
+        let headings: Vec<_> = blocks
+            .iter()
+            .filter(|b| matches!(b, DocumentBlock::Heading { .. }))
+            .collect();
+        assert_eq!(headings.len(), 3);
+    }
+
+    #[test]
+    fn test_paragraph() {
+        let md = "Hello world.\n\nThis is a paragraph.";
+        let blocks = parse_markdown(md).unwrap();
+        let paras: Vec<_> = blocks
+            .iter()
+            .filter(|b| matches!(b, DocumentBlock::Paragraph { .. }))
+            .collect();
+        assert_eq!(paras.len(), 2);
+    }
+
+    #[test]
+    fn test_unordered_list() {
+        let md = "- one\n- two\n- three";
+        let blocks = parse_markdown(md).unwrap();
+        if let Some(DocumentBlock::List { items, ordered, .. }) = blocks.first() {
+            assert!(!ordered);
+            assert_eq!(items.len(), 3);
+            assert_eq!(items[0], "one");
+        } else {
+            panic!("expected a list block");
+        }
+    }
+
+    #[test]
+    fn test_ordered_list() {
+        let md = "1. first\n2. second\n3. third";
+        let blocks = parse_markdown(md).unwrap();
+        if let Some(DocumentBlock::List { items, ordered, .. }) = blocks.first() {
+            assert!(*ordered);
+            assert_eq!(items.len(), 3);
+        } else {
+            panic!("expected an ordered list block");
+        }
+    }
+
+    #[test]
+    fn test_nested_list_item_text_is_separated() {
+        let md = "- parent\n  - child";
+        let blocks = parse_markdown(md).unwrap();
+        if let Some(DocumentBlock::List { items, .. }) = blocks.first() {
+            assert_eq!(items.len(), 1);
+            assert_eq!(items[0], "parent\nchild");
+        } else {
+            panic!("expected a list block");
+        }
+    }
+
+    #[test]
+    fn test_code_block() {
+        let md = "```rust\nfn main() {}\n```";
+        let blocks = parse_markdown(md).unwrap();
+        if let Some(DocumentBlock::CodeBlock { language, code, .. }) = blocks.first() {
+            assert_eq!(language.as_deref(), Some("rust"));
+            assert!(code.contains("fn main()"));
+        } else {
+            panic!("expected a code block");
+        }
+    }
+
+    #[test]
+    fn test_blockquote() {
+        let md = "> This is a quote";
+        let blocks = parse_markdown(md).unwrap();
+        if let Some(DocumentBlock::Quote { text, .. }) = blocks.first() {
+            assert!(text.contains("This is a quote"));
+        } else {
+            panic!("expected a quote");
+        }
+    }
+
+    #[test]
+    fn test_blockquote_paragraphs_are_separated() {
+        let md = "> first\n>\n> second";
+        let blocks = parse_markdown(md).unwrap();
+        if let Some(DocumentBlock::Quote { text, .. }) = blocks.first() {
+            assert_eq!(text, "first\nsecond");
+        } else {
+            panic!("expected a quote");
+        }
+    }
+
+    #[test]
+    fn test_table() {
+        // GFM table with separator row
+        let md = "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |";
+        let blocks = parse_markdown(md).unwrap();
+        if let Some(DocumentBlock::Table { headers, rows, .. }) = blocks.first() {
+            assert_eq!(headers.len(), 2);
+            assert_eq!(headers[0], "A");
+            assert_eq!(rows.len(), 2);
+            assert_eq!(rows[1][1], "4");
+        } else {
+            panic!("expected a table block, got {:?}", blocks);
+        }
+    }
+
+    #[test]
+    fn test_horizontal_rule() {
+        let md = "---";
+        let blocks = parse_markdown(md).unwrap();
+        assert!(matches!(
+            blocks.first(),
+            Some(DocumentBlock::HorizontalRule { .. })
+        ));
+    }
+
+    #[test]
+    fn test_image() {
+        let md = "![alt text](https://example.com/img.png)";
+        let blocks = parse_markdown(md).unwrap();
+        if let Some(DocumentBlock::Image { src, alt, .. }) = blocks.first() {
+            assert_eq!(src, "https://example.com/img.png");
+            assert_eq!(alt.as_deref(), Some("alt text"));
+        } else {
+            panic!("expected an image block");
+        }
+    }
+
+    #[test]
+    fn test_complex_document() {
+        let md = "# Heading\n\nParagraph text.\n\n- item 1\n- item 2\n\n```rs\nlet x = 1;\n```";
+        let blocks = parse_markdown(md).unwrap();
+        assert!(blocks.len() >= 4);
+        assert!(blocks.iter().any(|b| matches!(b, DocumentBlock::Heading { .. })));
+        assert!(blocks.iter().any(|b| matches!(b, DocumentBlock::Paragraph { .. })));
+        assert!(blocks.iter().any(|b| matches!(b, DocumentBlock::List { .. })));
+        assert!(blocks.iter().any(|b| matches!(b, DocumentBlock::CodeBlock { .. })));
+    }
+}