<?php

namespace App\Services\Markdown;

/**
 * Linter class is responsible for formatting markdown content.
 *
 * The content is split into blocks (fenced code, headers, tables, blockquotes,
 * lists, footnotes and paragraphs) and every block is formatted according to
 * its type. Paired HTML elements are swapped for placeholders before the
 * blocks are processed and restored verbatim afterwards.
 */
class Linter
{
    /**
     * The markdown content to be formatted.
     *
     * @var string
     */
    private $markdown;

    /**
     * Characters that mark the end of a sentence.
     *
     * @var array
     */
    private $phraseEndingChars = ['.', '!', '?'];

    /**
     * HTML fragments taken out of the content, keyed by the placeholder that replaced them.
     *
     * @var array
     */
    private $htmlPlaceholders = [];

    /**
     * Constructor takes markdown content and prepares it for formatting.
     *
     * @param string|null $markdown Markdown content to format; null is treated as an empty string.
     */
    public function __construct(?string $markdown = '')
    {
        // Passing null to mb_convert_encoding() is deprecated, so coalesce first.
        $converted = mb_convert_encoding($markdown ?? '', 'UTF-8');

        $this->markdown = is_string($converted) ? $converted : '';
    }

    /**
     * Format the markdown content by applying various formatting rules.
     *
     * The stored content itself is never modified, so the method can be called
     * repeatedly and always works from the original input.
     *
     * @return string The formatted markdown.
     */
    public function format(): string
    {
        $this->htmlPlaceholders = [];
        $markdown = $this->replaceHtmlWithPlaceholders($this->markdown);

        if ($markdown === '') {
            return '';
        }

        $processedBlocks = [];
        foreach ($this->segmentMarkdown($markdown) as $block) {
            $processedBlocks[] = $this->formatBlock($block, $this->determineBlockType($block));
        }

        return $this->restoreHtmlFromPlaceholders(implode("\n\n", $processedBlocks));
    }

    /**
     * Put the HTML fragments back in place of their placeholders.
     *
     * @param string $text Text containing placeholders.
     * @return string Text with the original HTML restored.
     */
    private function restoreHtmlFromPlaceholders(string $text): string
    {
        return str_replace(array_keys($this->htmlPlaceholders), array_values($this->htmlPlaceholders), $text);
    }

    /**
     * Replace every paired HTML element (<tag ...>...</tag>) with an HTML comment placeholder.
     *
     * The replaced fragments are remembered in $this->htmlPlaceholders.
     *
     * @param string $text Markdown that may contain HTML.
     * @return string Markdown with the HTML elements replaced by placeholders.
     */
    private function replaceHtmlWithPlaceholders(string $text): string
    {
        $index = 0;

        $replaced = preg_replace_callback('/<([a-zA-Z0-9\-]+)([^>]*)>(.*?)<\/\1>/s', function ($matches) use (&$index) {
            $placeholder = "<!-- html-placeholder-{$index} -->";
            $this->htmlPlaceholders[$placeholder] = $matches[0]; // Store the whole match
            $index++;

            return $placeholder;
        }, $text);

        if ($replaced === null) {
            // The regex engine gave up (e.g. backtrack limit): keep the text untouched.
            $this->htmlPlaceholders = [];

            return $text;
        }

        return $replaced;
    }

    /**
     * Split markdown into blocks separated by blank lines.
     *
     * A fenced code block (delimited by lines starting with ```) is always kept
     * as a single block, including the blank lines it contains. Line endings are
     * normalised to "\n" first. Blank lines outside of code blocks only act as
     * separators and are never part of a block.
     *
     * @param string $markdown The markdown to split.
     * @return array The list of blocks, each one terminated by "\n".
     */
    private function segmentMarkdown(string $markdown): array
    {
        $blocks = [];
        $currentBlock = '';
        $inCodeBlock = false;

        $lines = explode("\n", str_replace(["\r\n", "\r"], "\n", $markdown));

        foreach ($lines as $line) {
            $isFence = strncmp(trim($line), '```', 3) === 0;

            if ($isFence && $inCodeBlock) {
                // End of a code block
                $blocks[] = $currentBlock . $line . "\n";
                $currentBlock = '';
                $inCodeBlock = false;
                continue;
            }

            if ($isFence) {
                // Start of a code block: flush whatever was being collected
                if ($currentBlock !== '') {
                    $blocks[] = $currentBlock;
                }
                $currentBlock = $line . "\n";
                $inCodeBlock = true;
                continue;
            }

            if ($inCodeBlock) {
                // Inside a code block every line is kept verbatim
                $currentBlock .= $line . "\n";
                continue;
            }

            if (trim($line) === '') {
                // Blank line: close the current block, if any
                if ($currentBlock !== '') {
                    $blocks[] = $currentBlock;
                    $currentBlock = '';
                }
                continue;
            }

            $currentBlock .= $line . "\n";
        }

        // Add the last block if not empty (this also covers an unterminated code block)
        if (trim($currentBlock) !== '') {
            $blocks[] = $currentBlock;
        }

        return $blocks;
    }

    /**
     * Determine the type of a markdown block.
     *
     * Only the beginning of the block is inspected.
     *
     * @param string $block The markdown block to analyze.
     * @return string One of: code, header, table, blockquote, list, footnote, paragraph.
     */
    private function determineBlockType(string $block): string
    {
        $trimmed = trim($block);

        if ($trimmed === '') {
            return 'paragraph';
        }

        if (strncmp($trimmed, '```', 3) === 0) {
            return 'code';
        }

        switch ($trimmed[0]) {
            case '#':
                return 'header';
            case '|':
                return 'table';
            case '>':
                // The space after the marker is optional in markdown
                return 'blockquote';
        }

        // Bullet lists (-, *, +) and ordered lists (1. or 1))
        if (preg_match('/^(?:[-*+]|\d+[.)])\s/', $trimmed)) {
            return 'list';
        }

        if (preg_match('/^\[\^[\w-]+\]:/', $trimmed)) {
            return 'footnote';
        }

        return 'paragraph'; // Default to paragraph if no other type matches
    }

    /**
     * Apply formatting rules to a single markdown block based on its type.
     *
     * @param string $block The markdown block to format.
     * @param string $type The type of the block.
     * @return string The formatted block.
     */
    private function formatBlock(string $block, string $type): string
    {
        $block = trim($block, "\n");

        switch ($type) {
            case 'code':
                return $this->formatCodeBlock($block);
            case 'header':
                return $this->formatHeaderBlock($block);
            case 'table':
                return $this->formatTableBlock($block);
            case 'blockquote':
                return $this->formatBlockquoteBlock($block);
            case 'list':
                return $this->formatListBlock($block);
            case 'footnote':
                return $this->formatFootnoteBlock($block);
            default:
                return $this->formatParagraphBlock($block);
        }
    }

    /**
     * Normalise the fences of a fenced code block; the code itself is untouched.
     *
     * A trailing "{...}" attribute list is removed from the opening fence and
     * the closing fence is reduced to "```". When the block has no closing
     * fence, one is appended instead of overwriting the last line of code.
     *
     * @param string $block The code block to format.
     * @return string The formatted code block.
     */
    private function formatCodeBlock(string $block): string
    {
        $lines = explode("\n", $block);

        // Clean the first line if it starts with ```
        $opening = trim($lines[0]);
        if (strncmp($opening, '```', 3) === 0) {
            $lines[0] = preg_replace('/^(```\w*)\s*{.*?}$/', '$1', $opening) ?? $opening;
        }

        $last = count($lines) - 1;

        if ($last > 0 && strncmp(trim($lines[$last]), '```', 3) === 0) {
            $lines[$last] = '```';
        } else {
            // Unterminated block: close it without dropping any content
            $lines[] = '```';
        }

        return implode("\n", $lines);
    }

    /**
     * Footnote definitions are currently returned unchanged.
     *
     * @param string $block The footnote block.
     * @return string The same block.
     */
    private function formatFootnoteBlock(string $block): string
    {
        return $block;
    }

    /**
     * Format a header block: underscore emphasis is converted to asterisks.
     *
     * @param string $block The header block.
     * @return string The formatted header block.
     */
    private function formatHeaderBlock(string $block): string
    {
        return $this->replaceUnderscoresWithAsterisks($block);
    }

    /**
     * Tables are currently returned unchanged.
     *
     * @param string $block The table block.
     * @return string The same block.
     */
    private function formatTableBlock(string $block): string
    {
        return $block;
    }

    /**
     * Blockquotes are currently returned unchanged.
     *
     * @param string $block The blockquote block.
     * @return string The same block.
     */
    private function formatBlockquoteBlock(string $block): string
    {
        return $block;
    }

    /**
     * Lists are currently returned unchanged.
     *
     * @param string $block The list block.
     * @return string The same block.
     */
    private function formatListBlock(string $block): string
    {
        return $block;
    }

    /**
     * Apply formatting rules to a paragraph block.
     *
     * Three dots become an ellipsis character, whitespace is collapsed, the
     * paragraph is broken into one sentence per line and underscore emphasis is
     * converted to asterisks.
     *
     * @param string $block The paragraph block to format.
     * @return string The formatted paragraph block.
     */
    private function formatParagraphBlock(string $block): string
    {
        // Normalize three dots and variants to the ellipsis character
        $block = preg_replace('/\.{3}(?!\.)/', '…', $block) ?? $block;

        // Remove unnecessary new lines within the paragraph
        $block = str_replace("\n", ' ', $block);

        // Normalize spaces (replace multiple spaces with a single space)
        $block = preg_replace('/\s+/', ' ', $block) ?? $block;

        // Avoid adding space in markdown links by temporarily replacing them.
        // Every token ends with "_" so that token 1 is never a prefix of token 10.
        $links = [];
        $withTokens = preg_replace_callback('/\[[^\]]+\]\([^\)]+\)/', function ($matches) use (&$links) {
            $token = sprintf('link_placeholder_%d_', count($links));
            $links[$token] = $matches[0];

            return $token;
        }, $block);

        if ($withTokens === null) {
            $links = [];
        } else {
            $block = $withTokens;
        }

        // Add space after punctuation
        $block = preg_replace('/(\S)([.!?…])(\s|$)/', '$1$2 ', $block) ?? $block;

        // Restore links
        if ($links !== []) {
            $block = strtr($block, $links);
        }

        $quotedChars = array_map(function ($char) {
            return preg_quote($char, '/');
        }, $this->phraseEndingChars);
        $delimiter = sprintf('/(?<=[%s])\s+/u', implode('', $quotedChars));

        $sentences = preg_split($delimiter, $block, -1, PREG_SPLIT_NO_EMPTY);
        if ($sentences === false) {
            // Splitting failed (e.g. invalid UTF-8): treat the paragraph as a single sentence
            $sentences = [$block];
        }

        $sentences = array_map(function ($sentence) {
            // Replace underscores by asterisks when they are used as pairs and not part of markdown links
            return trim($this->replaceUnderscoresWithAsterisks($sentence));
        }, $sentences);

        // Join sentences by new lines
        return implode("\n", $sentences);
    }

    /**
     * Convert underscore emphasis (_text_, __text__) to asterisk emphasis (*text*, **text**).
     *
     * Inline code spans, markdown links, <...> spans and footnote references are
     * left untouched. Underscores inside words (snake_case) are not emphasis
     * markers and are left untouched as well.
     *
     * @param string $sentence The sentence to process.
     * @return string The processed sentence.
     */
    private function replaceUnderscoresWithAsterisks(string $sentence): string
    {
        // Spans that must not be altered, swapped for tokens while the emphasis is rewritten
        $patterns = [
            '/`+[^`]+`+/', // Match inline code spans
            '/\[[^\]]+\]\([^\)]+\)/', // Match links of the form [text](link)
            '/<[^>]+>/', // Match links of the form <link>
            '/\[\^[^\]]+\]/', // Match footnote references of the form [^footnote]
        ];

        $protected = [];
        foreach ($patterns as $pattern) {
            $withTokens = preg_replace_callback($pattern, function ($matches) use (&$protected) {
                // Tokens contain no underscore and end with "-" so none is a prefix of another
                $token = sprintf('link-placeholder-%d-', count($protected));
                $protected[$token] = $matches[0];

                return $token;
            }, $sentence);

            if ($withTokens !== null) {
                $sentence = $withTokens;
            }
        }

        // Strong emphasis first, then regular emphasis; both must sit on word boundaries
        $emphasis = [
            '/(?<![\p{L}\p{N}_])__(?=[^\s_])(.+?)(?<=[^\s_])__(?![\p{L}\p{N}_])/u' => '**${1}**',
            '/(?<![\p{L}\p{N}_])_(?=[^\s_])([^_]+?)(?<=[^\s_])_(?![\p{L}\p{N}_])/u' => '*${1}*',
        ];

        foreach ($emphasis as $pattern => $replacement) {
            $sentence = preg_replace($pattern, $replacement, $sentence) ?? $sentence;
        }

        // Restore in reverse order: a later token may wrap an earlier one
        foreach (array_reverse($protected, true) as $token => $original) {
            $sentence = str_replace($token, $original, $sentence);
        }

        return $sentence;
    }
}