FilterUrl.php
Same filename and directory in other branches
Namespace
Drupal\filter\Plugin\Filter
File
- core/modules/filter/src/Plugin/Filter/FilterUrl.php
View source
<?php
namespace Drupal\filter\Plugin\Filter;
use Drupal\Component\Utility\Html;
use Drupal\Component\Utility\Unicode;
use Drupal\Core\Form\FormStateInterface;
use Drupal\Core\Plugin\ContainerFactoryPluginInterface;
use Drupal\Core\StringTranslation\TranslatableMarkup;
use Drupal\filter\Attribute\Filter;
use Drupal\filter\FilterProcessResult;
use Drupal\filter\Plugin\FilterBase;
use Drupal\filter\Plugin\FilterInterface;
use Symfony\Component\DependencyInjection\Attribute\Autowire;
/**
* Provides a filter to convert URLs into links.
*/
class FilterUrl extends FilterBase implements ContainerFactoryPluginInterface {

  /**
   * The list of protocols that are allowed in URLs.
   */
  protected array $filterProtocols;

  /**
   * Temporary storage for HTML comments.
   *
   * Maps the SHA-256 hash used in a comment placeholder to the original
   * comment contents. It is reset for every task in self::process().
   */
  protected array $htmlComments = [];

  /**
   * Constructs a FilterUrl object.
   *
   * @param array $configuration
   *   The plugin configuration, passed through to the parent constructor.
   * @param string $plugin_id
   *   The plugin ID, passed through to the parent constructor.
   * @param mixed $plugin_definition
   *   The plugin definition, passed through to the parent constructor.
   * @param array|null $filter_protocols
   *   The value of the 'filter_protocols' container parameter. Omitting it is
   *   deprecated; the value is then read from the global container.
   */
  public function __construct(array $configuration, $plugin_id, $plugin_definition, #[Autowire(param: 'filter_protocols')] ?array $filter_protocols = NULL) {
    parent::__construct($configuration, $plugin_id, $plugin_definition);
    if ($filter_protocols === NULL) {
      @trigger_error('Calling ' . __METHOD__ . '() without the $filter_protocols argument is deprecated in drupal:11.4.0 and will be required in drupal:12.0.0. See https://www.drupal.org/node/3566774', E_USER_DEPRECATED);
      $filter_protocols = \Drupal::getContainer()->getParameter('filter_protocols');
    }
    $this->filterProtocols = $filter_protocols;
  }

  /**
   * {@inheritdoc}
   */
  public function settingsForm(array $form, FormStateInterface $form_state) {
    $form['filter_url_length'] = [
      '#type' => 'number',
      '#title' => $this->t('Maximum link text length'),
      '#default_value' => $this->settings['filter_url_length'],
      '#min' => 1,
      '#field_suffix' => $this->t('characters'),
      '#description' => $this->t('URLs longer than this number of characters will be truncated to prevent long strings that break formatting. The link itself will be retained; just the text portion of the link will be truncated.'),
    ];
    return $form;
  }

  /**
   * {@inheritdoc}
   */
  public function process($text, $langcode) {
    // Store the current text in case any of the preg_* functions fail.
    $saved_text = $text;

    // Tags to skip and not recurse into.
    $ignore_tags = 'a|script|style|code|pre';

    // Each task pairs the callback that builds the replacement markup with the
    // regular expression whose matches are handed to that callback. The tasks
    // are applied one after the other in the loop further below.
    $tasks = [];

    // Prepare protocols pattern for absolute URLs.
    // \Drupal\Component\Utility\UrlHelper::stripDangerousProtocols() will
    // replace any bad protocols with HTTP, so we need to support the identical
    // list. While '//' is technically optional for MAILTO only, we cannot
    // cleanly differ between protocols here without hard-coding MAILTO, so '//'
    // is optional for all protocols.
    // Protocol names are quoted so that characters which are valid in a URI
    // scheme but special in a regular expression (for example the "+" in
    // "svn+ssh") are matched literally. The backtick is the pattern delimiter.
    // @see \Drupal\Component\Utility\UrlHelper::stripDangerousProtocols()
    $quoted_protocols = array_map(
      static fn (string $protocol): string => preg_quote($protocol, '`'),
      $this->filterProtocols,
    );
    $protocols = implode(':(?://)?|', $quoted_protocols) . ':(?://)?';

    $valid_url_path_characters = "[\\p{L}\\p{M}\\p{N}!\\*\\';:=\\+,\\.\$\\/%#\\[\\]\\-_~@&]";

    // Allow URL paths to contain balanced parens
    // 1. Used in Wikipedia URLs like /Primer_(film)
    // 2. Used in IIS sessions like /S(dfd346)/
    $valid_url_balanced_parens = '\\(' . $valid_url_path_characters . '+\\)';

    // Valid end-of-path characters (so /foo. does not gobble the period). Allow
    // =&# for empty URL parameters and other URL-join artifacts.
    $valid_url_ending_characters = '[\\p{L}\\p{M}\\p{N}:_+~#=/]|(?:' . $valid_url_balanced_parens . ')';

    $valid_url_query_chars = '[a-zA-Z0-9!?\\*\'@\\(\\);:&=\\+\\$\\/%#\\[\\]\\-_\\.,~|]';
    $valid_url_query_ending_chars = '[a-zA-Z0-9_&=#\\/]';

    // Full path and allow @ in a URL, but only in the middle. Catch things like
    // http://example.com/@user/
    $valid_url_path = '(?:(?:' . $valid_url_path_characters . '*(?:' . $valid_url_balanced_parens . $valid_url_path_characters . '*)*' . $valid_url_ending_characters . ')|(?:@' . $valid_url_path_characters . '+\\/))';

    // Prepare the domain name pattern. The ICANN seems to be on track towards
    // accepting more diverse top level domains (TLDs), so this pattern has been
    // "future-proofed" to allow for TLDs of length 2-64.
    $domain = '(?:[\\p{L}\\p{M}\\p{N}._+-]+\\.)?[\\p{L}\\p{M}]{2,64}\\b';

    // Mail domains differ from the generic domain pattern, specifically: A "."
    // character must be present in the string that follows the @ character.
    $email_domain = '(?:[\\p{L}\\p{M}\\p{N}._+-]+\\.)+[\\p{L}\\p{M}]{2,64}\\b';
    $ip = '(?:[0-9]{1,3}\\.){3}[0-9]{1,3}';
    $auth = '[\\p{L}\\p{M}\\p{N}:%_+*~#?&=.,/;-]+@';
    $trail = '(' . $valid_url_path . '*)?(\\?' . $valid_url_query_chars . '*' . $valid_url_query_ending_chars . ')?';

    // Match absolute URLs.
    $url_pattern = "(?:{$auth})?(?:{$domain}|{$ip})/?(?:{$trail})?";
    $pattern = "`((?:{$protocols})(?:{$url_pattern}))`u";
    $tasks[] = [$this->parseFullLinks(...), $pattern];

    // Match email addresses.
    $url_pattern = "[\\p{L}\\p{M}\\p{N}._+-]{1,254}@(?:{$email_domain})";
    $pattern = "`({$url_pattern})`u";
    $tasks[] = [$this->parseEmailLinks(...), $pattern];

    // Match www domains.
    $url_pattern = "www\\.(?:{$domain})/?(?:{$trail})?";
    $pattern = "`({$url_pattern})`u";
    $tasks[] = [$this->parsePartialLinks(...), $pattern];

    // Each type of URL needs to be processed separately. The text is joined and
    // re-split after each task, since all injected HTML tags must be correctly
    // protected before the next task.
    foreach ($tasks as [$callback, $pattern]) {
      // Initialize the HTML comment temporary storage.
      // @see self::escapeComments()
      // @see self::unescapeComments()
      $this->htmlComments = [];

      // HTML comments need to be handled separately, as they may contain HTML
      // markup, especially a '>'. Therefore, remove all comment contents and
      // add them back later.
      $text = is_null($text) ? '' : preg_replace_callback('`<!--(.*?)-->`s', $this->escapeComments(...), $text);

      // Split at all tags; ensures that no tags or attributes are processed.
      // $text is NULL here when the comment replacement above failed.
      $chunks = is_null($text) ? [''] : preg_split('/(<.+?>)/is', $text, -1, PREG_SPLIT_DELIM_CAPTURE);

      // Do not attempt to convert links into URLs if preg_split() fails.
      if ($chunks !== FALSE) {
        // If a tag of $ignore_tags is found, it is stored in $open_tag and only
        // removed when the closing tag is found. Until the closing tag is
        // found, no replacements are made.
        $open_tag = '';

        // PHP ensures that the array consists of alternating delimiters and
        // literals, and begins and ends with a literal. Therefore chunks at
        // even offsets are text and chunks at odd offsets are tags.
        foreach ($chunks as $i => $chunk) {
          if ($i % 2 === 0) {
            // Only process this text if there are no unclosed $ignore_tags.
            if ($open_tag === '') {
              // Inject links into this chunk via the task callback. If the
              // replacement fails (NULL), keep the chunk unchanged rather than
              // silently dropping its content.
              $chunks[$i] = preg_replace_callback($pattern, $callback, $chunk) ?? $chunk;
            }
            continue;
          }

          if ($open_tag === '') {
            // Check whether this tag is contained in $ignore_tags.
            if (preg_match("`<({$ignore_tags})(?:\\s|>)`i", $chunk, $matches)) {
              $open_tag = $matches[1];
            }
          }
          elseif (preg_match("`</{$open_tag}>`i", $chunk)) {
            // The ignored element has been closed; resume processing.
            $open_tag = '';
          }
        }

        $text = implode($chunks);
      }

      // Revert to the original comment contents.
      $text = $text ? preg_replace_callback('`<!--(.*?)-->`', $this->unescapeComments(...), $text) : $text;
    }

    // If there is no text at this point, revert to the previous text.
    $text = strlen((string) $text) > 0 ? $text : $saved_text;
    return new FilterProcessResult($text);
  }

  /**
   * {@inheritdoc}
   */
  public function tips($long = FALSE) {
    return $this->t('Web page addresses and email addresses turn into links automatically.');
  }

  /**
   * Makes links out of absolute URLs.
   *
   * Callback for preg_replace_callback() within self::process().
   *
   * @param array $match
   *   Regexp match array.
   *
   * @return string
   *   Parsed markup
   */
  protected function parseFullLinks(array $match) : string {
    // The $i:th parenthesis in the regexp contains the URL.
    $i = 1;
    // Decode first so that entities already present in the text are not
    // escaped a second time below.
    $match[$i] = Html::decodeEntities($match[$i]);
    $caption = Html::escape($this->trimUrl($match[$i]));
    $match[$i] = Html::escape($match[$i]);
    return '<a href="' . $match[$i] . '">' . $caption . '</a>';
  }

  /**
   * Makes links out of email addresses.
   *
   * Callback for preg_replace_callback() within self::process().
   *
   * @param array $match
   *   Regexp match array.
   *
   * @return string
   *   Parsed markup
   */
  protected function parseEmailLinks(array $match) : string {
    // The complete match (index 0) is the email address.
    $i = 0;
    $match[$i] = Html::decodeEntities($match[$i]);
    $caption = Html::escape($this->trimUrl($match[$i]));
    $match[$i] = Html::escape($match[$i]);
    return '<a href="mailto:' . $match[$i] . '">' . $caption . '</a>';
  }

  /**
   * Makes links out of domain names starting with "www.".
   *
   * Callback for preg_replace_callback() within self::process().
   *
   * @param array $match
   *   Regexp match array.
   *
   * @return string
   *   Parsed markup
   */
  protected function parsePartialLinks(array $match) : string {
    // The $i:th parenthesis in the regexp contains the URL.
    $i = 1;
    $match[$i] = Html::decodeEntities($match[$i]);
    $caption = Html::escape($this->trimUrl($match[$i]));
    $match[$i] = Html::escape($match[$i]);
    // The matched text has no scheme, so the link target is prefixed with one.
    return '<a href="http://' . $match[$i] . '">' . $caption . '</a>';
  }

  /**
   * Escapes the contents of HTML comments.
   *
   * Callback for preg_replace_callback() within self::process(). Replaces all
   * HTML comments with a '<!-- [hash] -->' placeholder.
   *
   * @param array<string> $match
   *   An array containing matches to replace from preg_replace_callback(),
   *   whereas $match[1] is expected to contain the content to be filtered.
   *
   * @return string
   *   The escaped comment.
   */
  protected function escapeComments(array $match) : string {
    $hash = hash('sha256', $match[1]);
    $this->htmlComments[$hash] = $match[1];
    return "<!-- {$hash} -->";
  }

  /**
   * Unescapes the contents of HTML comments.
   *
   * Callback for preg_replace_callback() within self::process(). Replaces
   * placeholders with actual comment contents.
   *
   * @param array<string> $match
   *   An array containing matches to replace from preg_replace_callback(),
   *   whereas $match[1] is expected to contain the placeholder hash.
   *
   * @return string
   *   The unescaped comment, or the matched text unchanged when it is not a
   *   known placeholder.
   */
  protected function unescapeComments(array $match) : string {
    $hash = trim($match[1]);
    // Leave comments that were not produced by self::escapeComments() alone
    // instead of reading an undefined key and emitting an empty comment.
    if (!array_key_exists($hash, $this->htmlComments)) {
      return $match[0];
    }
    $content = $this->htmlComments[$hash];
    return "<!--{$content}-->";
  }

  /**
   * Shortens a long URL to a given length ending with an ellipsis.
   *
   * @param string $url
   *   The URL to shorten.
   *
   * @return string
   *   The shortened URL by a 'filter_url_length' setting cutoff.
   */
  protected function trimUrl(string $url) : string {
    return Unicode::truncate(string: $url, max_length: $this->settings['filter_url_length'], add_ellipsis: TRUE);
  }

}
Classes
| Title | Deprecated | Summary |
|---|---|---|
| FilterUrl | | Provides a filter to convert URLs into links. |
Buggy or inaccurate documentation? Please file an issue. Need support? Need help programming? Connect with the Drupal community.