1
0
cms11/app/Services/Markdown/Linter.php

329 lines
10 KiB
PHP
Raw Normal View History

<?php
namespace App\Services\Markdown;
/**
 * Linter normalises Markdown content block by block.
 *
 * The text is split on blank lines (fenced code blocks are kept whole), each
 * block is classified, and only the block types that have a formatter are
 * rewritten: fenced code (fence clean-up), headers (underscore emphasis) and
 * paragraphs (whitespace, ellipsis, emphasis, one sentence per line).
 * Every other block type is passed through unchanged.
 */
class Linter
{
    /**
     * The markdown content to be formatted (passed through mb_convert_encoding
     * by the constructor).
     *
     * @var string
     */
    private $markdown;

    /**
     * Characters that mark the end of a sentence.
     *
     * @var string[]
     */
    private $phraseEndingChars = ['.', '!', '?'];

    /**
     * HTML fragments lifted out of the text during format(), keyed by the
     * placeholder comment that temporarily replaces them.
     *
     * @var array<string, string>
     */
    private $htmlPlaceholders = [];

    /**
     * Constructor takes markdown content and prepares it for formatting.
     *
     * @param string|null $markdown Markdown content to format; null is treated as an empty string.
     */
    public function __construct(?string $markdown = '')
    {
        // Coalesce first: passing null to mb_convert_encoding() is deprecated since PHP 8.1.
        $this->markdown = (string) mb_convert_encoding($markdown ?? '', 'UTF-8');
    }

    /**
     * Format the markdown content by applying various formatting rules.
     *
     * The stored markdown is not modified, so the method can be called
     * repeatedly and returns the same result each time. Blocks are joined
     * with one blank line and the result carries no trailing newline.
     *
     * @return string The formatted markdown.
     */
    public function format(): string
    {
        if ($this->markdown === '') {
            return '';
        }

        // Start from a clean slate so placeholders of a previous run cannot leak in.
        $this->htmlPlaceholders = [];

        // Work on "\n" only; otherwise "\r" survives at line ends of untouched blocks.
        $text = str_replace(["\r\n", "\r"], "\n", $this->markdown);
        $text = $this->replaceHtmlWithPlaceholders($text);

        $formatted = [];
        foreach ($this->segmentMarkdown($text) as $block) {
            $formatted[] = $this->formatBlock($block, $this->determineBlockType($block));
        }

        return $this->restoreHtmlFromPlaceholders(implode("\n\n", $formatted));
    }

    /**
     * Put the HTML fragments recorded by replaceHtmlWithPlaceholders() back.
     *
     * @param string $text Text containing placeholder comments.
     * @return string Text with the original HTML restored.
     */
    private function restoreHtmlFromPlaceholders(string $text): string
    {
        if ($this->htmlPlaceholders === []) {
            return $text;
        }

        // strtr() replaces in a single pass, so restored HTML is never re-scanned.
        return strtr($text, $this->htmlPlaceholders);
    }

    /**
     * Replace every paired HTML element (<tag ...>...</tag>) with a placeholder
     * comment so that block segmentation and formatting cannot alter it.
     *
     * @param string $text Markdown text.
     * @return string Text with HTML elements replaced by placeholders.
     */
    private function replaceHtmlWithPlaceholders(string $text): string
    {
        $replaced = preg_replace_callback(
            '/<([a-zA-Z0-9\-]+)([^>]*)>(.*?)<\/\1>/s',
            function (array $matches): string {
                $placeholder = sprintf('<!-- html-placeholder-%d -->', count($this->htmlPlaceholders));
                $this->htmlPlaceholders[$placeholder] = $matches[0]; // Store the whole match

                return $placeholder;
            },
            $text
        );

        if ($replaced === null) {
            // PCRE failure (e.g. backtrack limit): continue with the text as it was.
            $this->htmlPlaceholders = [];

            return $text;
        }

        return $replaced;
    }

    /**
     * Split markdown into blocks separated by blank lines.
     *
     * A fenced code block (lines starting with ```) is always kept as one
     * block, including any blank lines inside it. An unclosed fence extends
     * to the end of the text.
     *
     * @param string $markdown Text using "\n" line endings.
     * @return string[] Non-empty blocks; every line of a block ends with "\n".
     */
    private function segmentMarkdown(string $markdown): array
    {
        $blocks = [];
        $current = '';
        $inCodeBlock = false;

        foreach (explode("\n", $markdown) as $line) {
            if (strncmp(trim($line), '```', 3) === 0) {
                if ($inCodeBlock) {
                    // Closing fence: finish the code block.
                    $blocks[] = $current . $line . "\n";
                    $current = '';
                    $inCodeBlock = false;
                } else {
                    // Opening fence: flush whatever was collected before it.
                    if ($current !== '') {
                        $blocks[] = $current;
                    }
                    $current = $line . "\n";
                    $inCodeBlock = true;
                }
                continue;
            }

            if ($inCodeBlock) {
                $current .= $line . "\n";
                continue;
            }

            if (trim($line) === '') {
                // Blank line: ends the current block; extra blank lines are dropped.
                if ($current !== '') {
                    $blocks[] = $current;
                    $current = '';
                }
                continue;
            }

            $current .= $line . "\n";
        }

        if ($current !== '') {
            $blocks[] = $current;
        }

        return $blocks;
    }

    /**
     * Determine the type of a markdown block.
     *
     * Anything that is not positively recognised as plain prose must not be
     * classified as 'paragraph', because paragraphs are re-flowed onto new lines.
     *
     * @param string $block The markdown block to analyze.
     * @return string One of 'code', 'header', 'table', 'blockquote', 'list',
     *                'footnote', 'verbatim' or 'paragraph'.
     */
    private function determineBlockType(string $block): string
    {
        $trimmed = trim($block);

        if (strncmp($trimmed, '```', 3) === 0) {
            return 'code';
        }
        // Indented block (indented code or list continuation): indentation is significant.
        if (preg_match('/^(?: {4}|\t)/', $block) === 1) {
            return 'verbatim';
        }
        if (preg_match('/^#/', $trimmed) === 1) {
            return 'header';
        }
        if (preg_match('/^\|/', $trimmed) === 1) {
            return 'table';
        }
        if (preg_match('/^>/', $trimmed) === 1) {
            return 'blockquote';
        }
        // Bullets may be "-", "*" or "+"; ordered items end in "." or ")".
        if (preg_match('/^(?:[-*+]|\d+[.)])\s/', $trimmed) === 1) {
            return 'list';
        }
        if (preg_match('/^\[\^[\w-]+\]:/', $trimmed) === 1) {
            return 'footnote';
        }
        // A line made only of "-", "=", "|" or ":" is a setext underline, a rule
        // or a table delimiter row; joining such a block into one line would break it.
        if (preg_match('/^[ \t]*[-=|:][-=|: \t]*$/m', $block) === 1) {
            return 'verbatim';
        }

        return 'paragraph'; // Default to paragraph if no other type matches
    }

    /**
     * Apply formatting rules to a single markdown block based on its type.
     *
     * @param string $block The markdown block to format.
     * @param string $type The type of the block.
     * @return string The formatted block, without leading or trailing newlines.
     */
    private function formatBlock(string $block, string $type): string
    {
        $block = trim($block, "\n");

        switch ($type) {
            case 'code':
                return $this->formatCodeBlock($block);
            case 'header':
                return $this->formatHeaderBlock($block);
            case 'table':
                return $this->formatTableBlock($block);
            case 'blockquote':
                return $this->formatBlockquoteBlock($block);
            case 'list':
                return $this->formatListBlock($block);
            case 'footnote':
                return $this->formatFootnoteBlock($block);
            case 'verbatim':
                return $block;
            default:
                return $this->formatParagraphBlock($block);
        }
    }

    /**
     * Clean up the fences of a fenced code block; the code itself is untouched.
     *
     * The opening fence loses a trailing "{...}" attribute list, the closing
     * fence is normalised to "```", and a missing closing fence is added.
     *
     * @param string $block The code block, starting with its opening fence.
     * @return string The cleaned code block.
     */
    private function formatCodeBlock(string $block): string
    {
        $lines = explode("\n", $block);

        $opening = trim($lines[0]);
        if (strncmp($opening, '```', 3) === 0) {
            $lines[0] = preg_replace('/^(```\w*)\s*\{.*?\}$/', '$1', $opening) ?? $opening;
        }

        $last = count($lines) - 1;
        if ($last > 0 && strncmp(trim($lines[$last]), '```', 3) === 0) {
            $lines[$last] = '```';
        } else {
            // Unclosed fence: append the closing fence instead of overwriting
            // the last line, which is real content (or the opening fence itself).
            $lines[] = '```';
        }

        return implode("\n", $lines);
    }

    /**
     * Footnote definitions are currently left unchanged.
     */
    private function formatFootnoteBlock(string $block): string
    {
        return $block;
    }

    /**
     * Header blocks only get underscore emphasis converted to asterisks.
     */
    private function formatHeaderBlock(string $block): string
    {
        return $this->replaceUnderscoresWithAsterisks($block);
    }

    /**
     * Tables are currently left unchanged.
     */
    private function formatTableBlock(string $block): string
    {
        return $block;
    }

    /**
     * Blockquotes are currently left unchanged.
     */
    private function formatBlockquoteBlock(string $block): string
    {
        return $block;
    }

    /**
     * Lists are currently left unchanged.
     */
    private function formatListBlock(string $block): string
    {
        return $block;
    }

    /**
     * Apply formatting rules to a paragraph block.
     *
     * Steps: collapse whitespace and line breaks to single spaces, turn "..."
     * into the ellipsis character, convert _emphasis_ to *emphasis*, and put
     * every sentence on its own line. Inline code, links, images, tags,
     * footnote references and bare URLs are masked while this happens, so
     * they are neither rewritten nor split. Original line breaks inside the
     * paragraph are not preserved.
     *
     * @param string $block The paragraph block to format.
     * @return string The formatted paragraph block.
     */
    private function formatParagraphBlock(string $block): string
    {
        // Explicit ASCII class: "\s" without the "u" flag is locale dependent
        // and must not be run over raw UTF-8 bytes.
        $text = trim(preg_replace('/[ \t\n\r\f\x0B]+/', ' ', $block) ?? $block);

        [$text, $spans] = $this->protectInlineSpans($text);

        // Exactly three dots become an ellipsis; longer dot runs are left alone.
        $text = preg_replace('/(?<!\.)\.{3}(?!\.)/', '…', $text) ?? $text;
        $text = $this->convertUnderscoreEmphasis($text);

        $endings = implode('', array_map(function (string $char): string {
            return preg_quote($char, '/');
        }, $this->phraseEndingChars));

        // Do not break where the next line would start with a block marker
        // (header, blockquote, list item), as that would change the document structure.
        $delimiter = sprintf('/(?<=[%s])\s+(?!#|>|[-+*]\s|\d+[.)]\s)/u', $endings);
        $sentences = preg_split($delimiter, $text, -1, PREG_SPLIT_NO_EMPTY);
        if ($sentences === false) {
            $sentences = [$text];
        }

        // Join sentences by new lines, then put the masked spans back.
        return $this->restoreInlineSpans(implode("\n", array_map('trim', $sentences)), $spans);
    }

    /**
     * Replace underscores with asterisks when used in pairs, not affecting markdown links.
     *
     * @param string $sentence The sentence to process.
     * @return string The processed sentence.
     */
    private function replaceUnderscoresWithAsterisks(string $sentence): string
    {
        [$masked, $spans] = $this->protectInlineSpans($sentence);

        return $this->restoreInlineSpans($this->convertUnderscoreEmphasis($masked), $spans);
    }

    /**
     * Mask inline constructs whose content must not be rewritten.
     *
     * Masked in one pass: inline code, links and images "[text](url)", anything
     * in angle brackets (autolinks, tags, comments), footnote references "[^id]"
     * and bare http(s) URLs (without trailing punctuation).
     *
     * @param string $text Text to mask.
     * @return array{0: string, 1: array<string, string>} The masked text and the token => original map.
     */
    private function protectInlineSpans(string $text): array
    {
        $pattern = '/(`+).+?\1(?!`)'
            . '|!?\[[^\]]+\]\([^\)]+\)'
            . '|<[^<>\s][^<>]*>'
            . '|\[\^[^\]]+\]'
            . '|\bhttps?:\/\/[^\s<>]*[^\s<>.,;:!?)\]\'"]'
            . '/';

        $spans = [];
        $masked = preg_replace_callback($pattern, function (array $match) use (&$spans): string {
            // The token is delimited on both sides, so token 1 is never a prefix of
            // token 10, and it contains no character the formatting rules look for.
            $token = "\x1A" . count($spans) . "\x1A";
            $spans[$token] = $match[0];

            return $token;
        }, $text);

        if ($masked === null) {
            return [$text, []];
        }

        return [$masked, $spans];
    }

    /**
     * Undo protectInlineSpans().
     *
     * @param string $text Masked text.
     * @param array<string, string> $spans Token => original map.
     * @return string Text with the original spans restored.
     */
    private function restoreInlineSpans(string $text, array $spans): string
    {
        return $spans === [] ? $text : strtr($text, $spans);
    }

    /**
     * Convert _emphasis_ to *emphasis*.
     *
     * Only a pair that stands on word boundaries is converted, so identifiers
     * such as snake_case_name, escaped underscores (\_) and __strong__ markers
     * are left untouched. Text that already contains an asterisk between the
     * underscores is skipped as well.
     *
     * @param string $text Text without constructs that must be preserved verbatim.
     * @return string The converted text.
     */
    private function convertUnderscoreEmphasis(string $text): string
    {
        $converted = preg_replace(
            '/(?<![\w\\\\])_(?![\s_])([^_*]*[^_*\s])_(?!\w)/u',
            '*$1*',
            $text
        );

        return $converted ?? $text;
    }
}