markdown = mb_convert_encoding($markdown, 'UTF-8');
    }

    /**
     * Format the markdown content by applying various formatting rules.
     *
     * The content is segmented into blocks, each block is classified and
     * handed to the formatter for its type, and the results are joined with
     * one blank line between consecutive blocks.
     *
     * @return string The formatted markdown.
     */
    public function format(): string
    {
        // Nothing to format (note that empty() also treats the string "0" as
        // empty, in which case the content is returned untouched).
        if (empty($this->markdown)) {
            return $this->markdown;
        }

        $formattedBlocks = array_map(function ($block) {
            return $this->formatBlock($block, $this->determineBlockType($block));
        }, $this->segmentMarkdown());

        return implode("\n\n", $formattedBlocks);
    }

    /**
     * Segment the markdown into blocks separated by blank lines.
     *
     * A fenced code block (delimited by lines starting with ```) is always
     * kept as a single block, even when it contains blank lines, and it
     * always starts a new block even when no blank line precedes it.
     * Blank lines outside of code fences only act as separators: they are
     * never stored in a block, and a block is only emitted when it contains
     * something other than whitespace.
     *
     * @return array Array of blocks, each containing markdown content
     *               (every line of a block is terminated by "\n").
     */
    private function segmentMarkdown(): array
    {
        $blocks = [];
        $currentBlock = '';
        $inCodeBlock = false;

        foreach (explode("\n", $this->markdown) as $line) {
            $isFence = preg_match('/^```/', trim($line)) === 1;

            if ($isFence && $inCodeBlock) {
                // Closing fence: the code block is complete.
                $blocks[] = $currentBlock . $line . "\n";
                $currentBlock = '';
                $inCodeBlock = false;
                continue;
            }

            if ($isFence) {
                // Opening fence: flush whatever text precedes it. Comparing
                // the trimmed text against '' (instead of using empty())
                // avoids emitting whitespace-only blocks and avoids dropping
                // a block whose whole content is "0".
                if (trim($currentBlock) !== '') {
                    $blocks[] = $currentBlock;
                }
                $currentBlock = $line . "\n";
                $inCodeBlock = true;
                continue;
            }

            if ($inCodeBlock) {
                // Inside a fence every line, blank or not, belongs to the block.
                $currentBlock .= $line . "\n";
                continue;
            }

            if (trim($line) === '') {
                // Blank line outside a fence: close the pending block, if any.
                if (trim($currentBlock) !== '') {
                    $blocks[] = $currentBlock;
                }
                $currentBlock = '';
                continue;
            }

            $currentBlock .= $line . "\n";
        }

        // Flush the trailing block; this also covers a code fence that was
        // never closed before the end of the input.
        if (trim($currentBlock) !== '') {
            $blocks[] = $currentBlock;
        }

        return $blocks;
    }

    /**
     * Determine the type of a markdown block.
     *
     * @param string $block The markdown block to analyze.
     * @return string The type of the block.
*/
    private function determineBlockType(string $block): string
    {
        // Surrounding whitespace is irrelevant for classification, so trim
        // once and anchor every test at the very start of the content.
        $content = trim($block);

        // Checked in order; the first matching pattern wins.
        $patterns = [
            'code' => '/^```/',
            'html' => '/^<[^>]+>/',
            'header' => '/^#/',
            'table' => '/^\|/',
            // A ">" marker does not need a following space to start a quote.
            'blockquote' => '/^>/',
            // Bullets (-, *, +) and ordered markers ("1." or "1)"). Without
            // this, such lists are treated as paragraphs and their items
            // get merged onto a single line by the paragraph formatter.
            'list' => '/^(?:[-*+]|\d+[.)])\s/',
            'footnote' => '/^\[\^[\w-]+\]:/',
        ];

        foreach ($patterns as $type => $pattern) {
            if (preg_match($pattern, $content) === 1) {
                return $type;
            }
        }

        return 'paragraph'; // Default to paragraph if no other type matches
    }

    /**
     * Apply formatting rules to a single markdown block based on its type.
     *
     * Leading and trailing newlines are stripped from the block before it is
     * dispatched to the formatter matching its type.
     *
     * @param string $block The markdown block to format.
     * @param string $type The type of the block.
     * @return string The formatted block.
     */
    private function formatBlock(string $block, string $type): string
    {
        $block = trim($block, "\n");

        switch ($type) {
            case 'code':
                return $this->formatCodeBlock($block);
            case 'html':
                return $this->formatHtmlBlock($block);
            case 'header':
                return $this->formatHeaderBlock($block);
            case 'table':
                return $this->formatTableBlock($block);
            case 'blockquote':
                return $this->formatBlockquoteBlock($block);
            case 'list':
                return $this->formatListBlock($block);
            case 'footnote':
                return $this->formatFootnoteBlock($block);
            default:
                return $this->formatParagraphBlock($block);
        }
    }

    /**
     * Normalize the fences of a fenced code block.
     *
     * The opening fence is trimmed and a trailing "{...}" attribute group is
     * removed from it (the language tag is kept). The closing fence is
     * rewritten as a bare ```; when the block has no closing fence, one is
     * appended instead of overwriting the last line of code.
     *
     * @param string $block The code block, without surrounding newlines.
     * @return string The normalized code block.
     */
    private function formatCodeBlock(string $block): string
    {
        // explode() always yields at least one element.
        $lines = explode("\n", $block);

        $opening = trim($lines[0]);
        if (preg_match('/^```/', $opening)) {
            $lines[0] = preg_replace('/^(```\w*)\s*{.*?}$/', '$1', $opening) ?? $opening;
        }

        $last = count($lines) - 1;
        if ($last > 0 && preg_match('/^```/', trim($lines[$last]))) {
            // Regular case: normalize the existing closing fence.
            $lines[$last] = '```';
        } else {
            // Unclosed block: close it without discarding any content.
            $lines[] = '```';
        }

        return implode("\n", $lines);
    }

    /**
     * Format an HTML block. HTML is currently passed through unchanged.
     *
     * @param string $block The HTML block.
     * @return string The same block.
     */
    private function formatHtmlBlock(string $block): string
    {
        return $block;
    }

    /**
     * Format a footnote definition block. Currently passed through unchanged.
     *
     * @param string $block The footnote block.
     * @return string The same block.
     */
    private function formatFootnoteBlock(string $block): string
    {
        return $block;
    }

    /**
     * Format a header block: underscore emphasis is rewritten with asterisks.
     *
     * @param string $block The header block.
     * @return string The formatted header block.
     */
    private function formatHeaderBlock(string $block): string
    {
        return $this->replaceUnderscoresWithAsterisks($block);
    }

    /**
     * Format a table block. Currently passed through unchanged.
     *
     * @param string $block The table block.
     * @return string The same block.
     */
    private function formatTableBlock(string $block): string
    {
        return $block;
    }

    /**
     * Format a blockquote block. Currently passed through unchanged.
     *
     * @param string $block The blockquote block.
     * @return string The same block.
     */
    private function formatBlockquoteBlock(string $block): string
    {
        return $block;
    }

    /**
     * Format a list block. Currently passed through unchanged.
     *
     * @param string $block The list block.
     * @return string The same block.
     */
    private function formatListBlock(string $block): string
    {
        return $block;
    }

    /**
     * Apply formatting rules to a paragraph block.
     *
     * The paragraph is collapsed onto a single line, "..." is turned into
     * the ellipsis character, and the text is then split after the
     * characters listed in $this->phraseEndingChars so that every sentence
     * ends up on its own line. Underscore emphasis is rewritten with
     * asterisks in each sentence. Inline code spans and markdown links are
     * shielded from the ellipsis and punctuation rewriting.
     *
     * @param string $block The paragraph block to format.
     * @return string The formatted paragraph block.
     */
    private function formatParagraphBlock(string $block): string
    {
        // Collapse every run of whitespace (including newlines) to one space.
        $block = preg_replace('/\s+/', ' ', $block) ?? $block;

        // Temporarily swap inline code spans and links for placeholders so
        // that the rewriting below cannot alter their content.
        $placeholders = [];
        $masked = preg_replace_callback(
            '/`[^`]+`|\[[^\]]+\]\([^\)]+\)/',
            static function (array $match) use (&$placeholders) {
                $key = 'link_placeholder_' . count($placeholders);
                $placeholders[$key] = $match[0];

                return $key;
            },
            $block
        );
        if ($masked !== null) {
            $block = $masked;
        }

        // Normalize three dots and variants to the ellipsis character
        $block = preg_replace('/\.{3}(?!\.)/', '…', $block) ?? $block;

        // Add space after punctuation
        $block = preg_replace('/(\S)([.!?…])(\s|$)/', '$1$2 ', $block) ?? $block;

        // Restore the protected spans. strtr() always prefers the longest
        // matching key, so "link_placeholder_1" can never clobber the start
        // of "link_placeholder_10".
        if ($placeholders !== []) {
            $block = strtr($block, $placeholders);
        }

        // Split after any phrase-ending character that is followed by whitespace.
        $endingChars = array_map(static function ($char) {
            return preg_quote($char, '/');
        }, $this->phraseEndingChars);
        $delimiter = sprintf('/(?<=[%s])\s+/u', implode('', $endingChars));

        $sentences = preg_split($delimiter, $block, -1, PREG_SPLIT_NO_EMPTY);
        if ($sentences === false) {
            // The split failed (e.g. invalid UTF-8 under the /u modifier):
            // keep the paragraph on one line rather than failing.
            $sentences = [$block];
        }

        $sentences = array_map(function ($sentence) {
            // Replace underscores by asterisks when they are used as pairs
            // and not part of markdown links
            return trim($this->replaceUnderscoresWithAsterisks($sentence));
        }, $sentences);

        // Join sentences by new lines
        return implode("\n", $sentences);
    }

    /**
     * Replace underscores with asterisks when used in pairs, not affecting markdown links.
     *
     * Only single-underscore emphasis (_text_) is rewritten. Underscores
     * inside words (snake_case), double underscores (__strong__) and
     * underscores adjacent to whitespace on the inner side are left alone,
     * because rewriting them would change how the text renders. Inline code
     * spans, links, angle-bracket constructs and footnote references are
     * never modified.
     *
     * @param string $sentence The sentence to process.
     * @return string The processed sentence.
     */
    private function replaceUnderscoresWithAsterisks(string $sentence): string
    {
        // Spans whose underscores must be left untouched.
        $patterns = [
            '/`[^`]+`/', // Inline code spans
            '/\[[^\]]+\]\([^\)]+\)/', // Links of the form [text](link)
            '/<[^>]+>/', // Angle-bracket constructs (HTML tags, <autolinks>)
            '/\[\^[^\]]+\]/', // Footnote references of the form [^footnote]
        ];

        // Placeholders contain no underscore and are delimited by hyphens,
        // so one placeholder is never a prefix of another and they behave
        // like punctuation next to an emphasis marker.
        $protected = [];
        foreach ($patterns as $pattern) {
            $masked = preg_replace_callback(
                $pattern,
                static function (array $match) use (&$protected) {
                    $placeholder = sprintf('-link-placeholder-%d-', count($protected));
                    $protected[$placeholder] = $match[0];

                    return $placeholder;
                },
                $sentence
            );
            if ($masked !== null) {
                $sentence = $masked;
            }
        }

        // Rewrite _text_ as *text*.
        $converted = preg_replace_callback(
            '/(?<![\p{L}\p{N}_])_(?!\s)([^_]+)(?<!\s)_(?![\p{L}\p{N}_])/u',
            static function (array $match) {
                return '*' . $match[1] . '*';
            },
            $sentence
        );
        if ($converted !== null) {
            // null means the /u pattern failed (e.g. invalid UTF-8); in
            // that case the sentence is left as it is.
            $sentence = $converted;
        }

        // Restore in reverse order: a span protected later may contain a
        // placeholder created earlier (e.g. a code span inside a link text),
        // so the outer span has to be put back first.
        foreach (array_reverse($protected, true) as $placeholder => $original) {
            $sentence = str_replace($placeholder, $original, $sentence);
        }

        return $sentence;
    }
}