diff --git a/Languages/en_US/General.php b/Languages/en_US/General.php index 9c7d9460ea..dd58718e6f 100644 --- a/Languages/en_US/General.php +++ b/Languages/en_US/General.php @@ -12,8 +12,6 @@ //https://developers.google.com/recaptcha/docs/language $txt['lang_recaptcha'] = 'en'; -// Ensure you remember to use uppercase for character set strings. -$txt['lang_character_set'] = 'UTF-8'; // Character set right to left? 0 = ltr; 1 = rtl $txt['lang_rtl'] = '0'; diff --git a/Languages/en_US/Install.php b/Languages/en_US/Install.php index d2083d22b8..0626de28f6 100644 --- a/Languages/en_US/Install.php +++ b/Languages/en_US/Install.php @@ -2,8 +2,7 @@ // Version: 3.0 Alpha 2; Install -// These should be the same as those in index.language.php. -$txt['lang_character_set'] = 'UTF-8'; +// This should be the same as the one in General.php. $txt['lang_rtl'] = '0'; $txt['install_step_welcome'] = 'Welcome'; diff --git a/Sources/Actions/Admin/ACP.php b/Sources/Actions/Admin/ACP.php index bf954485ae..69467c0308 100644 --- a/Sources/Actions/Admin/ACP.php +++ b/Sources/Actions/Admin/ACP.php @@ -1059,7 +1059,7 @@ public static function saveSettings(array &$config_vars): void // Fix the darn stupid cookiename! (more may not be allowed, but these for sure!) if (isset($_POST['cookiename'])) { - $_POST['cookiename'] = preg_replace('~[,;\s\.$]+~' . (Utils::$context['utf8'] ? 'u' : ''), '', $_POST['cookiename']); + $_POST['cookiename'] = preg_replace('~[,;\s\.$]+~u', '', $_POST['cookiename']); } // Fix the forum's URL if necessary. diff --git a/Sources/Actions/Admin/Languages.php b/Sources/Actions/Admin/Languages.php index b879d2715f..2d617664aa 100644 --- a/Sources/Actions/Admin/Languages.php +++ b/Sources/Actions/Admin/Languages.php @@ -887,7 +887,11 @@ function ($val1, $val2) { $replace_array = []; foreach ($primary_settings as $setting => $type) { - $replace_array['~\$txt\[\'' . $setting . '\'\]\s*=\s*[^\r\n]+~'] = '$txt[\'' . $setting . '\'] = ' . ($type === 'bool' ? (!empty($_POST[$setting]) ? 'true' : 'false') : '\'' . ($setting = 'native_name' ? htmlentities(Utils::htmlspecialcharsDecode($_POST[$setting]), ENT_QUOTES, Utils::$context['character_set']) : preg_replace('~[^\w-]~i', '', $_POST[$setting])) . '\'') . ';'; + if ($setting === 'lang_character_set') { + $replace_array['/\$txt\[\'' . $setting . '\'\]\s*=\s*[^\r\n]+\R/u'] = ''; + } else { + $replace_array['~\$txt\[\'' . $setting . '\'\]\s*=\s*[^\r\n]+~u'] = '$txt[\'' . $setting . '\'] = ' . ($type === 'bool' ? (!empty($_POST[$setting]) ? '\'1\'' : '\'0\'') : '\'' . ($setting = 'native_name' ? htmlentities(Utils::htmlspecialcharsDecode($_POST[$setting]), ENT_QUOTES, 'UTF-8') : preg_replace('~[^\w-]~i', '', $_POST[$setting])) . '\'') . ';'; + } } $current_data = preg_replace(array_keys($replace_array), array_values($replace_array), $current_data); @@ -910,6 +914,10 @@ function ($val1, $val2) { Utils::$context['primary_settings']['name'] = Utils::ucwords(strtr($lang_id, ['_' => ' ', '-utf8' => ''])); foreach ($primary_settings as $setting => $type) { + if ($setting === 'lang_character_set') { + continue; + } + Utils::$context['primary_settings'][$setting] = [ 'label' => str_replace('lang_', '', $setting), 'value' => $type === 'bool' ? !empty(Lang::$txt[$setting]) : Lang::$txt[$setting], @@ -1005,12 +1013,12 @@ function ($val1, $val2) { // Also, remove any lines for uneditable variables like $forum_copyright from the working data. $entries = []; - foreach (preg_split('~^(?=\$(?:' . implode('|', $string_types) . ')\[\'([^\n]+?)\'\])~m' . (Utils::$context['utf8'] ? 'u' : ''), preg_replace('~\s*\n(\$(?!(?:' . implode('|', $string_types) . '))[^\n]*)~', '', file_get_contents($current_file))) as $blob) { + foreach (preg_split('~^(?=\$(?:' . implode('|', $string_types) . ')\[\'([^\n]+?)\'\])~mu', preg_replace('~\s*\n(\$(?!(?:' . implode('|', $string_types) . '))[^\n]*)~', '', file_get_contents($current_file))) as $blob) { // Comment lines at the end of the blob can make terrible messes - $blob = preg_replace('~(\n[ \t]*//[^\n]*)*$~' . (Utils::$context['utf8'] ? 'u' : ''), '', $blob); + $blob = preg_replace('~(\n[ \t]*//[^\n]*)*$~u', '', $blob); // Extract the variable - if (preg_match('~^\$(' . implode('|', $string_types) . ')\[\'([^\n]+?)\'\](?:\[\'?([^\n]+?)\'?\])?\s?=\s?(.+);([ \t]*(?://[^\n]*)?)$~ms' . (Utils::$context['utf8'] ? 'u' : ''), strtr($blob, ["\r" => '']), $matches)) { + if (preg_match('~^\$(' . implode('|', $string_types) . ')\[\'([^\n]+?)\'\](?:\[\'?([^\n]+?)\'?\])?\s?=\s?(.+);([ \t]*(?://[^\n]*)?)$~msu', strtr($blob, ["\r" => '']), $matches)) { // If no valid subkey was found, we need it to be explicitly null $matches[3] = isset($matches[3]) && $matches[3] !== '' ? $matches[3] : null; @@ -1093,7 +1101,7 @@ function ($val1, $val2) { # Followed by a comma or the end of the string (?=,|$) - /x' . (Utils::$context['utf8'] ? 'u' : ''), $entryValue['entry'], $matches); + /xu', $entryValue['entry'], $matches); if (empty($m)) { continue; @@ -1325,7 +1333,7 @@ function ($val1, $val2) { // Apply our changes. foreach ($final_saves as $save) { if (!empty($save['is_regex'])) { - $file_contents = preg_replace('~' . $save['find'] . '~' . (Utils::$context['utf8'] ? 'u' : ''), $save['replace'], $file_contents); + $file_contents = preg_replace('~' . $save['find'] . '~u', $save['replace'], $file_contents); } else { $file_contents = str_replace($save['find'], $save['replace'], $file_contents); } @@ -1567,7 +1575,7 @@ public static function list_getLanguages(): array $languages[$lang['filename']] = [ 'id' => $lang['filename'], 'count' => 0, - 'char_set' => $txt['lang_character_set'], + 'char_set' => 'UTF-8', 'default' => Lang::$default == $lang['filename'] || (Lang::$default == '' && $lang['filename'] == 'en_US'), 'locale' => $txt['lang_locale'], 'name' => $lang['name'], diff --git a/Sources/Actions/Admin/Maintenance.php b/Sources/Actions/Admin/Maintenance.php index e8db19d2c2..c0f5414017 100644 --- a/Sources/Actions/Admin/Maintenance.php +++ b/Sources/Actions/Admin/Maintenance.php @@ -170,7 +170,7 @@ public function routine(): void public function database(): void { // Show some conversion options? - Utils::$context['convert_entities'] = isset(Config::$modSettings['global_character_set']) && Config::$modSettings['global_character_set'] === 'UTF-8'; + Utils::$context['convert_entities'] = true; if (Config::$db_type == 'mysql') { $colData = Db::$db->list_columns('{db_prefix}messages', true); @@ -1024,11 +1024,6 @@ public function entitiesToUnicode(): void { User::$me->isAllowedTo('admin_forum'); - // Check to see if UTF-8 is currently the default character set. - if (Config::$modSettings['global_character_set'] !== 'UTF-8') { - ErrorHandler::fatalLang('entity_convert_only_utf8'); - } - // Some starting values. Utils::$context['table'] = empty($_REQUEST['table']) ? 0 : (int) $_REQUEST['table']; Utils::$context['start'] = empty($_REQUEST['start']) ? 0 : (int) $_REQUEST['start']; diff --git a/Sources/Actions/AttachmentDownload.php b/Sources/Actions/AttachmentDownload.php index 9acc7d4450..492557a28b 100644 --- a/Sources/Actions/AttachmentDownload.php +++ b/Sources/Actions/AttachmentDownload.php @@ -23,7 +23,6 @@ use SMF\Config; use SMF\Db\DatabaseApi as Db; use SMF\IntegrationHook; -use SMF\Lang; use SMF\User; use SMF\Utils; @@ -320,15 +319,6 @@ public function execute(): void */ protected function __construct() { - // Some defaults that we need. - if (!isset(Utils::$context['character_set'])) { - Utils::$context['character_set'] = empty(Config::$modSettings['global_character_set']) ? (empty(Lang::$txt['lang_character_set']) ? 'ISO-8859-1' : Lang::$txt['lang_character_set']) : Config::$modSettings['global_character_set']; - } - - if (!isset(Utils::$context['utf8'])) { - Utils::$context['utf8'] = Utils::$context['character_set'] === 'UTF-8'; - } - // Which attachment was requested? $this->id = $_REQUEST['attach'] = isset($_REQUEST['attach']) ? (int) $_REQUEST['attach'] : (int) (isset($_REQUEST['id']) ? (int) $_REQUEST['id'] : 0); diff --git a/Sources/Actions/AttachmentUpload.php b/Sources/Actions/AttachmentUpload.php index ad92aac40d..7de9db67e0 100644 --- a/Sources/Actions/AttachmentUpload.php +++ b/Sources/Actions/AttachmentUpload.php @@ -558,7 +558,7 @@ protected function sendResponse(): void } // Set the header. - header('content-type: application/json; charset=' . Utils::$context['character_set'] . ''); + header('content-type: application/json; charset=UTF-8'); echo Utils::jsonEncode($this->_response ? $this->_response : []); diff --git a/Sources/Actions/Feed.php b/Sources/Actions/Feed.php index 746798c604..9e7ee3ed6c 100644 --- a/Sources/Actions/Feed.php +++ b/Sources/Actions/Feed.php @@ -535,11 +535,11 @@ public function emit(): void $filename[] = $this->format; - $filename = preg_replace(Utils::$context['utf8'] ? '/[^\p{L}\p{M}\p{N}\-]+/u' : '/[\s_,.\/\\;:\'<>?|\[\]{}~!@#$%^&*()=+`]+/', '_', str_replace('"', '', Utils::htmlspecialcharsDecode(strip_tags(implode('-', $filename))))); + $filename = preg_replace('/[^\p{L}\p{M}\p{N}\-]+/u', '_', str_replace('"', '', Utils::htmlspecialcharsDecode(strip_tags(implode('-', $filename))))); $file = [ 'filename' => $filename . '.xml', - 'mime_type' => self::MIME_TYPES[$this->format] . '; charset=' . (empty(Utils::$context['character_set']) ? 'UTF-8' : Utils::$context['character_set']), + 'mime_type' => self::MIME_TYPES[$this->format] . '; charset=UTF-8', 'content' => implode('', $this->xml), 'disposition' => isset($_GET['download']) ? 'attachment' : 'inline', ]; @@ -2765,7 +2765,7 @@ public static function build(string $format, array $data, array $metadata, strin Utils::$context['feed'] = []; // First, output the xml header. - Utils::$context['feed']['header'] = '' . ($doctype !== '' ? "\n" . trim($doctype) : ''); + Utils::$context['feed']['header'] = '<' . '?xml version="1.0" encoding="UTF-8"?' . '>' . ($doctype !== '' ? "\n" . trim($doctype) : ''); // Are we outputting an rss feed or one with more information? if ($format == 'rss' || $format == 'rss2') { diff --git a/Sources/Actions/Login2.php b/Sources/Actions/Login2.php index 2503b9993b..410a68d3f0 100644 --- a/Sources/Actions/Login2.php +++ b/Sources/Actions/Login2.php @@ -519,7 +519,7 @@ protected function checkPasswordFallbacks(): bool $other_passwords[] = sha1(strtolower(User::$profiles[User::$my_id]['member_name']) . Utils::htmlspecialcharsDecode($_POST['passwrd'])); // Perhaps we converted to UTF-8 and have a valid password being hashed differently. - if (Utils::$context['character_set'] == 'UTF-8' && !empty(Config::$modSettings['previousCharacterSet']) && Config::$modSettings['previousCharacterSet'] != 'utf8') { + if (!empty(Config::$modSettings['previousCharacterSet']) && Config::$modSettings['previousCharacterSet'] != 'utf8') { // Try iconv first, for no particular reason. if (function_exists('iconv')) { $other_passwords['iconv'] = sha1(strtolower(iconv('UTF-8', Config::$modSettings['previousCharacterSet'], User::$profiles[User::$my_id]['member_name'])) . Utils::htmlspecialcharsDecode(iconv('UTF-8', Config::$modSettings['previousCharacterSet'], $_POST['passwrd']))); diff --git a/Sources/Actions/Memberlist.php b/Sources/Actions/Memberlist.php index d9f2eb36a6..0698e52a01 100644 --- a/Sources/Actions/Memberlist.php +++ b/Sources/Actions/Memberlist.php @@ -293,7 +293,7 @@ public function all(): void } if (!is_numeric($_REQUEST['start'])) { - if (preg_match('~^[^\'\\\\/]~' . (Utils::$context['utf8'] ? 'u' : ''), Utils::strtolower($_REQUEST['start']), $match) === 0) { + if (preg_match('~^[^\'\\\\/]~u', Utils::strtolower($_REQUEST['start']), $match) === 0) { ErrorHandler::fatal('Are you a wannabe hacker?', false); } diff --git a/Sources/Actions/RequestMembers.php b/Sources/Actions/RequestMembers.php index e30ef7a39f..81cfe5b099 100644 --- a/Sources/Actions/RequestMembers.php +++ b/Sources/Actions/RequestMembers.php @@ -60,9 +60,7 @@ public function execute(): void { User::$me->checkSession('get'); - if (Utils::$context['utf8'] || function_exists('mb_convert_encoding')) { - header('content-type: text/plain; charset=UTF-8'); - } + header('content-type: text/plain; charset=UTF-8'); $request = Db::$db->query( '', @@ -81,15 +79,9 @@ public function execute(): void ); while ($row = Db::$db->fetch_assoc($request)) { - if (!Utils::$context['utf8']) { - if (($temp = @mb_convert_encoding($row['real_name'], 'UTF-8', Utils::$context['character_set'])) !== false) { - $row['real_name'] = $temp; - } - } - $row['real_name'] = strtr($row['real_name'], ['&' => '&', '<' => '<', '>' => '>', '"' => '"']); - $row['real_name'] = Utils::entityDecode($row['real_name'], true); + $row['real_name'] = Utils::entityDecode($row['real_name']); echo $row['real_name'], "\n"; } diff --git a/Sources/Autolinker.php b/Sources/Autolinker.php index b7a96dcd1e..c147a2e859 100644 --- a/Sources/Autolinker.php +++ b/Sources/Autolinker.php @@ -154,13 +154,6 @@ class Autolinker * Internal properties *********************/ - /** - * @var string - * - * The character encoding being used. - */ - protected string $encoding = 'UTF-8'; - /** * @var bool * @@ -258,20 +251,6 @@ public function __construct(bool $only_basic = false) { $this->only_basic = $only_basic; - if (!empty(Utils::$context['utf8'])) { - $this->encoding = 'UTF-8'; - } else { - $this->encoding = !empty(Config::$modSettings['global_character_set']) ? Config::$modSettings['global_character_set'] : (!empty(Lang::$txt['lang_character_set']) ? Lang::$txt['lang_character_set'] : $this->encoding); - - if (in_array($this->encoding, mb_encoding_aliases('UTF-8'))) { - $this->encoding = 'UTF-8'; - } - } - - if ($this->encoding !== 'UTF-8') { - self::$domain_label_chars = '0-9A-Za-z\-'; - } - // In case a mod wants to control behaviour for a special URI scheme. if (!self::$integrate_autolinker_schemes_done) { IntegrationHook::call('integrate_autolinker_schemes', [&self::$schemes]); @@ -409,14 +388,14 @@ public function detectUrls(string $string, bool $plaintext_only = false): array '((?' . '>' . '[^\[]|\[/?(?!' . $no_autolink_regex . ')' . '|(?1))*)' . // 4 = Closing BBC markup element. '(\[/\2\])' . - '~i' . ($this->encoding === 'UTF-8' ? 'u' : ''), + '~iu', fn($matches) => $matches[1] . str_repeat('x', strlen($matches[3])) . $matches[4], $string, ); // Overwrite all BBC markup elements. $string = preg_replace_callback( - '~\[/?' . Parser::getBBCodeTagsRegex() . '[^\]]*\]~i' . ($this->encoding === 'UTF-8' ? 'u' : ''), + '~\[/?' . Parser::getBBCodeTagsRegex() . '[^\]]*\]~iu', fn($matches) => str_repeat(' ', strlen($matches[0])), $string, ); @@ -430,21 +409,21 @@ public function detectUrls(string $string, bool $plaintext_only = false): array '((?' . '>' . '[^<]|)' . - '~i' . ($this->encoding === 'UTF-8' ? 'u' : ''), + '~iu', fn($matches) => $matches[1] . str_repeat('x', strlen($matches[2])) . $matches[3], $string, ); // Overwrite all HTML elements. $string = preg_replace_callback( - '~]*)>~i' . ($this->encoding === 'UTF-8' ? 'u' : ''), + '~]*)>~iu', fn($matches) => str_repeat(' ', strlen($matches[0])), $string, ); } preg_match_all( - '~' . $this->url_regex . '~i' . ($this->encoding === 'UTF-8' ? 'u' : ''), + '~' . $this->url_regex . '~iu', $string, $matches, PREG_OFFSET_CAPTURE, @@ -482,7 +461,7 @@ public function detectEmails(string $string, bool $plaintext_only = false): arra $this->setEmailRegex(); preg_match_all( - '~' . ($plaintext_only ? '(?:^|\s|
)\K' : '') . $this->email_regex . '~i' . ($this->encoding === 'UTF-8' ? 'u' : ''), + '~' . ($plaintext_only ? '(?:^|\s|
)\K' : '') . $this->email_regex . '~iu', $string, $matches, PREG_OFFSET_CAPTURE, @@ -834,7 +813,7 @@ protected function setTldRegex(): void return; } - if (!$this->only_basic && $this->encoding === 'UTF-8') { + if (!$this->only_basic) { Url::setTldRegex(); $this->tld_regex = Config::$modSettings['tld_regex']; } else { diff --git a/Sources/Config.php b/Sources/Config.php index f04b024433..07153ec742 100644 --- a/Sources/Config.php +++ b/Sources/Config.php @@ -1130,9 +1130,7 @@ public static function reloadModSettings(): void self::updateModSettings(['forum_uuid' => Uuid::getNamespace()]); } - // Here to justify the name of this function. :P - // It should be added to the install and upgrade scripts. - // But since the converters need to be updated also. This is easier. + // Ensure the attachment upload directory settings are valid. if (empty(self::$modSettings['currentAttachmentUploadDir'])) { self::updateModSettings([ 'attachmentUploadDir' => Utils::jsonEncode([1 => self::$modSettings['attachmentUploadDir']]), @@ -1147,6 +1145,11 @@ public static function reloadModSettings(): void self::$modSettings['attachmentSizeLimit'] = empty(self::$modSettings['attachmentSizeLimit']) ? $file_max_kb : min(self::$modSettings['attachmentSizeLimit'], $file_max_kb); self::$modSettings['attachmentNumPerPostLimit'] = !isset(self::$modSettings['attachmentNumPerPostLimit']) ? 4 : self::$modSettings['attachmentNumPerPostLimit']; + // Deprecated, but some old mods might use it. + if (!empty(self::$backward_compatibility)) { + self::$modSettings['global_character_set'] = 'UTF-8'; + } + // Integration is cool. if (defined('SMF_INTEGRATION_SETTINGS')) { $integration_settings = Utils::jsonDecode(SMF_INTEGRATION_SETTINGS, true); diff --git a/Sources/Draft.php b/Sources/Draft.php index b586ec35b1..33e090d044 100644 --- a/Sources/Draft.php +++ b/Sources/Draft.php @@ -801,9 +801,9 @@ protected static function xml(int $id_draft): void { Lang::load('Drafts'); - header('content-type: text/xml; charset=' . (empty(Utils::$context['character_set']) ? 'ISO-8859-1' : Utils::$context['character_set'])); + header('content-type: text/xml; charset=UTF-8'); - echo ' + echo '<' . '?xml version="1.0" encoding="UTF-8"?' . '> Time::create('@' . Utils::$context['draft_saved_on'])->format()]), ']]> '; diff --git a/Sources/Editor.php b/Sources/Editor.php index 21c12ba28f..00b51ef9a6 100644 --- a/Sources/Editor.php +++ b/Sources/Editor.php @@ -413,7 +413,7 @@ protected function init(): void // Some hidden information is needed in order to make the spell checking work. if (!isset($_REQUEST['xml'])) { Utils::$context['insert_after_template'] .= ' -
+
'; } diff --git a/Sources/Lang.php b/Sources/Lang.php index 1e9e160d81..753aed9db4 100644 --- a/Sources/Lang.php +++ b/Sources/Lang.php @@ -302,11 +302,15 @@ public static function load(string $template_name, string $lang = '', bool $fata if (str_contains(self::$txt['lang_locale'], '.')) { $locale_variants = self::$txt['lang_locale']; } else { - $locale_variants = array_unique(array_merge( - !empty(Config::$modSettings['global_character_set']) ? [self::$txt['lang_locale'] . '.' . Config::$modSettings['global_character_set']] : [], - !empty(Utils::$context['utf8']) ? [self::$txt['lang_locale'] . '.UTF-8', self::$txt['lang_locale'] . '.UTF8', self::$txt['lang_locale'] . '.utf-8', self::$txt['lang_locale'] . '.utf8'] : [], - [self::$txt['lang_locale']], - )); + $locale_variants = array_unique( + [ + self::$txt['lang_locale'] . '.UTF-8', + self::$txt['lang_locale'] . '.UTF8', + self::$txt['lang_locale'] . '.utf-8', + self::$txt['lang_locale'] . '.utf8', + self::$txt['lang_locale'], + ], + ); } setlocale(LC_CTYPE, $locale_variants); @@ -410,7 +414,6 @@ public static function addDirs(array|string $custom_dirs = []): void /** * Attempt to reload our known languages. - * It will try to choose only utf8 or non-utf8 languages. * * @param bool $use_cache Whether or not to use the cache * @return array An array of information about available languages @@ -606,8 +609,6 @@ public static function censorText(string &$text, bool $force = false): string $censor_vulgar = explode("\n", Config::$modSettings['censor_vulgar']); $censor_proper = explode("\n", Config::$modSettings['censor_proper']); - $charset = empty(Config::$modSettings['global_character_set']) ? self::$txt['lang_character_set'] : Config::$modSettings['global_character_set']; - // Quote them for use in regular expressions. for ($i = 0, $n = count($censor_vulgar); $i < $n; $i++) { // If a word is replaced with itself, just leave it as it is. @@ -624,13 +625,13 @@ public static function censorText(string &$text, bool $force = false): string if (!empty(Config::$modSettings['censorWholeWord'])) { // Use the faster \b if we can, or something more complex if we can't - $boundary_before = preg_match('/^\w/', $censor_vulgar[$i]) ? '\b' : ($charset === 'UTF-8' ? '(? 128) { - $simple = false; - } - } - unset($matches); - - if ($simple) { - $string = preg_replace_callback( - '~&#(\d{3,8});~', - function ($m) { - return chr((int) "{$m[1]}"); - }, - $string, - ); - } else { - // Try to convert the string to UTF-8. - if (!Utils::$context['utf8'] && function_exists('iconv')) { - $newstring = @iconv(Utils::$context['character_set'], 'UTF-8', $string); - - if ($newstring) { - $string = $newstring; - } - } - - $string = Utils::entityDecode($string, true); - - // Unicode, baby. - $charset = 'UTF-8'; - } + if (isset($custom_charset)) { + $string = mb_convert_encoding($string, 'UTF-8', $custom_charset); } - // Convert all special characters to HTML entities...just for Hotmail :-\ - if ($hotmail_fix && (Utils::$context['utf8'] || function_exists('iconv') || Utils::$context['character_set'] === 'ISO-8859-1')) { - if (!Utils::$context['utf8'] && function_exists('iconv')) { - $newstring = @iconv(Utils::$context['character_set'], 'UTF-8', $string); - - if ($newstring) { - $string = $newstring; - } - } + $string = Utils::entityDecode($string); - $entityConvert = function ($m) { - $c = $m[1]; - - if (strlen($c) === 1 && ord($c[0]) <= 0x7F) { - return $c; - } - - if (strlen($c) === 2 && ord($c[0]) >= 0xC0 && ord($c[0]) <= 0xDF) { - return '&#' . (((ord($c[0]) ^ 0xC0) << 6) + (ord($c[1]) ^ 0x80)) . ';'; - } - - if (strlen($c) === 3 && ord($c[0]) >= 0xE0 && ord($c[0]) <= 0xEF) { - return '&#' . (((ord($c[0]) ^ 0xE0) << 12) + ((ord($c[1]) ^ 0x80) << 6) + (ord($c[2]) ^ 0x80)) . ';'; - } - - if (strlen($c) === 4 && ord($c[0]) >= 0xF0 && ord($c[0]) <= 0xF7) { - return '&#' . (((ord($c[0]) ^ 0xF0) << 18) + ((ord($c[1]) ^ 0x80) << 12) + ((ord($c[2]) ^ 0x80) << 6) + (ord($c[3]) ^ 0x80)) . ';'; - } - - return ''; - }; - - // Convert all 'special' characters to HTML entities. - return [$charset, preg_replace_callback('~([\x80-\x{10FFFF}])~u', $entityConvert, $string), '7bit']; + // Convert all special characters to HTML entities...just for Hotmail :-\ + if ($hotmail_fix) { + return ['UTF-8', mb_encode_numericentity($string, [0x80, 0x10FFFF, 0, 0xFFFFFF], 'UTF-8'), '7bit']; } // We don't need to mess with the subject line if no special characters were in it.. - if (!$hotmail_fix && preg_match('~([^\x09\x0A\x0D\x20-\x7F])~', $string) === 1) { + if (preg_match('/([^\x{09}\x{0A}\x{0D}\x{20}-\x{7F}])/u', $string)) { // Base64 encode. $string = base64_encode($string); // Show the characterset and the transfer-encoding for header strings. if ($with_charset) { - $string = '=?' . $charset . '?B?' . $string . '?='; + $string = '=?UTF-8?B?' . $string . '?='; } // Break it up in lines (mail body). @@ -721,10 +658,10 @@ function ($m) { $string = chunk_split($string, 76, $line_break); } - return [$charset, $string, 'base64']; + return ['UTF-8', $string, 'base64']; } - return [$charset, $string, '7bit']; + return ['UTF-8', $string, '7bit']; } /** diff --git a/Sources/Msg.php b/Sources/Msg.php index 741947e006..26fcb8bba1 100644 --- a/Sources/Msg.php +++ b/Sources/Msg.php @@ -463,7 +463,7 @@ public function format(int $counter = 0, array $format_options = []): array $this->formatted['preview'] = strip_tags(strtr($this->formatted['body'], ['
' => ' '])); if (Utils::entityStrlen($this->formatted['preview']) > 128) { - $this->formatted['preview'] = Utils::entitySubstr($this->formatted['preview'], 0, 128) . (!empty(Utils::$context['utf8']) ? '…' : '...'); + $this->formatted['preview'] = Utils::entitySubstr($this->formatted['preview'], 0, 128) . '…'; } } @@ -627,14 +627,8 @@ public static function preparsecode(string &$message, bool $previewing = false, ]; $message = strtr($message, $control_replacements); - // This line makes all languages *theoretically* work even with the wrong charset ;). - if (empty(Utils::$context['utf8'])) { - $message = preg_replace('~&#(\d{4,5}|[2-9]\d{2,4}|1[2-9]\d);~', '&#$1;', $message); - } // Normalize Unicode characters for storage efficiency, better searching, etc. - else { - $message = Utils::normalize($message); - } + $message = Utils::normalize($message); // Clean out any other funky stuff. $message = Utils::sanitizeChars($message, 0); @@ -719,9 +713,6 @@ function ($a) { $message = implode('', $parts); - // The regular expression non breaking space has many versions. - $non_breaking_space = Utils::$context['utf8'] ? '\x{A0}' : '\xA0'; - // Autolink any plain-text URLs. if (!empty($autolink)) { $message = Autolinker::load()->makeLinks($message); @@ -833,25 +824,25 @@ function ($m) { $mistake_fixes = [ // Find [table]s not followed by [tr]. - '~\[table\](?![\s' . $non_breaking_space . ']*\[tr\])~s' . (Utils::$context['utf8'] ? 'u' : '') => '[table][tr]', + '~\[table\](?![\s\x{A0}]*\[tr\])~su' => '[table][tr]', // Find [tr]s not followed by [td]. - '~\[tr\](?![\s' . $non_breaking_space . ']*\[td\])~s' . (Utils::$context['utf8'] ? 'u' : '') => '[tr][td]', + '~\[tr\](?![\s\x{A0}]*\[td\])~su' => '[tr][td]', // Find [/td]s not followed by something valid. - '~\[/td\](?![\s' . $non_breaking_space . ']*(?:\[td\]|\[/tr\]|\[/table\]))~s' . (Utils::$context['utf8'] ? 'u' : '') => '[/td][/tr]', + '~\[/td\](?![\s\x{A0}]*(?:\[td\]|\[/tr\]|\[/table\]))~su' => '[/td][/tr]', // Find [/tr]s not followed by something valid. - '~\[/tr\](?![\s' . $non_breaking_space . ']*(?:\[tr\]|\[/table\]))~s' . (Utils::$context['utf8'] ? 'u' : '') => '[/tr][/table]', + '~\[/tr\](?![\s\x{A0}]*(?:\[tr\]|\[/table\]))~su' => '[/tr][/table]', // Find [/td]s incorrectly followed by [/table]. - '~\[/td\][\s' . $non_breaking_space . ']*\[/table\]~s' . (Utils::$context['utf8'] ? 'u' : '') => '[/td][/tr][/table]', + '~\[/td\][\s\x{A0}]*\[/table\]~su' => '[/td][/tr][/table]', // Find [table]s, [tr]s, and [/td]s (possibly correctly) followed by [td]. - '~\[(table|tr|/td)\]([\s' . $non_breaking_space . ']*)\[td\]~s' . (Utils::$context['utf8'] ? 'u' : '') => '[$1]$2[_td_]', + '~\[(table|tr|/td)\]([\s\x{A0}]*)\[td\]~su' => '[$1]$2[_td_]', // Now, any [td]s left should have a [tr] before them. '~\[td\]~s' => '[tr][td]', // Look for [tr]s which are correctly placed. - '~\[(table|/tr)\]([\s' . $non_breaking_space . ']*)\[tr\]~s' . (Utils::$context['utf8'] ? 'u' : '') => '[$1]$2[_tr_]', + '~\[(table|/tr)\]([\s\x{A0}]*)\[tr\]~su' => '[$1]$2[_tr_]', // Any remaining [tr]s should have a [table] before them. '~\[tr\]~s' => '[table][tr]', // Look for [/td]s followed by [/tr]. - '~\[/td\]([\s' . $non_breaking_space . ']*)\[/tr\]~s' . (Utils::$context['utf8'] ? 'u' : '') => '[/td]$1[_/tr_]', + '~\[/td\]([\s\x{A0}]*)\[/tr\]~su' => '[/td]$1[_/tr_]', // Any remaining [/tr]s should have a [/td]. '~\[/tr\]~s' => '[/td][/tr]', // Look for properly opened [li]s which aren't closed. @@ -859,14 +850,14 @@ function ($m) { '~\[li\]([^\[\]]+?)\[/list\]~s' => '[_li_]$1[_/li_][/list]', '~\[li\]([^\[\]]+?)$~s' => '[li]$1[/li]', // Lists - find correctly closed items/lists. - '~\[/li\]([\s' . $non_breaking_space . ']*)\[/list\]~s' . (Utils::$context['utf8'] ? 'u' : '') => '[_/li_]$1[/list]', + '~\[/li\]([\s\x{A0}]*)\[/list\]~su' => '[_/li_]$1[/list]', // Find list items closed and then opened. - '~\[/li\]([\s' . $non_breaking_space . ']*)\[li\]~s' . (Utils::$context['utf8'] ? 'u' : '') => '[_/li_]$1[_li_]', + '~\[/li\]([\s\x{A0}]*)\[li\]~su' => '[_/li_]$1[_li_]', // Now, find any [list]s or [/li]s followed by [li]. - '~\[(list(?: [^\]]*?)?|/li)\]([\s' . $non_breaking_space . ']*)\[li\]~s' . (Utils::$context['utf8'] ? 'u' : '') => '[$1]$2[_li_]', + '~\[(list(?: [^\]]*?)?|/li)\]([\s\x{A0}]*)\[li\]~su' => '[$1]$2[_li_]', // Allow for sub lists. - '~\[/li\]([\s' . $non_breaking_space . ']*)\[list\]~' . (Utils::$context['utf8'] ? 'u' : '') => '[_/li_]$1[list]', - '~\[/list\]([\s' . $non_breaking_space . ']*)\[li\]~' . (Utils::$context['utf8'] ? 'u' : '') => '[/list]$1[_li_]', + '~\[/li\]([\s\x{A0}]*)\[list\]~u' => '[_/li_]$1[list]', + '~\[/list\]([\s\x{A0}]*)\[li\]~u' => '[/list]$1[_li_]', // Any remaining [li]s weren't inside a [list]. '~\[li\]~' => '[list][li]', // Any remaining [/li]s weren't before a [/list]. @@ -908,9 +899,9 @@ function ($m) { // Restore white space entities if (!$previewing) { - $message = strtr($message, [' ' => '  ', "\n" => '
', Utils::$context['utf8'] ? "\xC2\xA0" : "\xA0" => ' ']); + $message = strtr($message, [' ' => '  ', "\n" => '
', "\u{A0}" => ' ']); } else { - $message = strtr($message, [' ' => '  ', Utils::$context['utf8'] ? "\xC2\xA0" : "\xA0" => ' ']); + $message = strtr($message, [' ' => '  ', "\u{A0}" => ' ']); } // Now let's quickly clean up things that will slow our parser (which are common in posted code.) diff --git a/Sources/Parser.php b/Sources/Parser.php index 41c24c05b7..e7c849b5ce 100644 --- a/Sources/Parser.php +++ b/Sources/Parser.php @@ -206,13 +206,6 @@ abstract class Parser */ public static string $smileys_url; - /** - * @var string - * - * The character encoding of the strings to be parsed. - */ - public static string $encoding; - /** * @var string * @@ -540,7 +533,6 @@ protected static function setStaticVars(): void self::$time_format = self::$time_format ?? User::$me->time_format ?? Time::getTimeFormat(); self::$locale = self::$locale ?? Lang::$txt['lang_locale'] ?? ''; - self::$encoding = self::$encoding ?? (!empty(Utils::$context['utf8']) ? 'UTF-8' : (!empty(Config::$modSettings['global_character_set']) ? Config::$modSettings['global_character_set'] : (!empty(Lang::$txt['lang_character_set']) ? Lang::$txt['lang_character_set'] : 'UTF-8'))); // Smiley settings. self::$custom_smileys_enabled = self::$custom_smileys_enabled ?? !empty(Config::$modSettings['smiley_enable']); @@ -699,7 +691,6 @@ protected static function getCacheKey(string $string, int $input_types, int $out $output_type, $options, // Localization settings. - self::$encoding, self::$locale, self::$time_offset, self::$time_format, diff --git a/Sources/Parsers/SmileyParser.php b/Sources/Parsers/SmileyParser.php index fb44f35ab1..5de08c2a0f 100644 --- a/Sources/Parsers/SmileyParser.php +++ b/Sources/Parsers/SmileyParser.php @@ -75,9 +75,6 @@ public function __construct() if (self::$smiley_set !== 'none') { $data = self::loadData(self::$smiley_set); - // The non-breaking-space is a complex thing... - $non_breaking_space = self::$encoding === 'UTF-8' ? '\x{A0}' : '\xA0'; - $this->smiley_preg_replacements = []; $search_parts = []; $smileys_path = Utils::htmlspecialchars(self::$smileys_url . '/' . rawurlencode(self::$smiley_set) . '/'); @@ -106,7 +103,7 @@ public function __construct() } // This smiley regex makes sure it doesn't parse smileys within code tags (so [url=mailto:David@bla.com] doesn't parse the :D smiley) - $this->smiley_preg_search = '~(?<=[>:\?\.\s' . $non_breaking_space . '[\]()*\\\;]|(?smiley_preg_search = '~(?<=[>:\?\.\s\x{A0}[\]()*\\\;]|(?params['search'], $matches, PREG_PATTERN_ORDER); + preg_match_all('~(?:^|\s)([-]?)"([^"]+)"(?:$|\s)~u', $this->params['search'], $matches, PREG_PATTERN_ORDER); $searchArray = $matches[2]; // Remove the phrase parts and extract the words. - $tempSearch = explode(' ', preg_replace('~(?:^|\s)(?:[-]?)"(?:[^"]+)"(?:$|\s)~' . (Utils::$context['utf8'] ? 'u' : ''), ' ', $this->params['search'])); + $tempSearch = explode(' ', preg_replace('~(?:^|\s)(?:[-]?)"(?:[^"]+)"(?:$|\s)~u', ' ', $this->params['search'])); // A minus sign in front of a word excludes the word.... so... $excludedWords = []; diff --git a/Sources/Search/APIs/Parsed.php b/Sources/Search/APIs/Parsed.php index eb864712b1..9da2f546ec 100644 --- a/Sources/Search/APIs/Parsed.php +++ b/Sources/Search/APIs/Parsed.php @@ -399,7 +399,7 @@ public function indexedWordQuery(array $words, array $search_data): mixed // Help SearchResult::highlight() to highlight the matches we actually // found, not just the strings that were originally requested. foreach (array_keys($found) as $word) { - $word = Utils::fixUtf8mb4(Utils::normalize(Utils::entityDecode($word, true), 'c')); + $word = Utils::fixUtf8mb4(Utils::normalize(Utils::entityDecode($word), 'c')); if (!in_array($word, $this->searchArray)) { $this->searchArray[] = $word; @@ -772,7 +772,7 @@ public static function updateStopwordsSetting(): void Db::$db->free_result($request); - $stopwords = array_map(fn($w) => Utils::normalize(Utils::entityDecode($w, true)), $stopwords); + $stopwords = array_map(fn($w) => Utils::normalize(Utils::entityDecode($w)), $stopwords); Config::updateModSettings([ 'search_stopwords_parsed' => implode(',', $stopwords), diff --git a/Sources/Search/SearchApi.php b/Sources/Search/SearchApi.php index ffcaf0831d..6f13115ae7 100644 --- a/Sources/Search/SearchApi.php +++ b/Sources/Search/SearchApi.php @@ -1115,7 +1115,7 @@ protected function setParams(): void protected function setSearchTerms(): void { // Change non-word characters into spaces. - $stripped_query = preg_replace('~(?:[\x0B\0' . (Utils::$context['utf8'] ? '\x{A0}' : '\xA0') . '\t\r\s\n(){}\\[\\]<>!@$%^*.,:+=`\~\?/\\\\]+|&(?:amp|lt|gt|quot);)+~' . (Utils::$context['utf8'] ? 'u' : ''), ' ', $this->params['search']); + $stripped_query = preg_replace('~(?:[\x0B\0\x{A0}\t\r\s\n(){}\\[\\]<>!@$%^*.,:+=`\~\?/\\\\]+|&(?:amp|lt|gt|quot);)+~u', ' ', $this->params['search']); // Fold the case of the query. It's gonna be case insensitive anyway. $stripped_query = Utils::htmlspecialcharsDecode(Utils::casefold($stripped_query)); @@ -1134,7 +1134,7 @@ protected function setSearchTerms(): void $phraseArray = $matches[2]; // Remove the phrase parts and extract the words. - $wordArray = preg_replace('~(?:^|\s)[-]?"[^"]+"(?:$|\s)~' . (Utils::$context['utf8'] ? 'u' : ''), ' ', $this->params['search']); + $wordArray = preg_replace('~(?:^|\s)[-]?"[^"]+"(?:$|\s)~u', ' ', $this->params['search']); $wordArray = explode(' ', Utils::htmlspecialchars(Utils::htmlspecialcharsDecode($wordArray), ENT_QUOTES)); diff --git a/Sources/Search/SearchResult.php b/Sources/Search/SearchResult.php index d1def94548..6eabba56f6 100644 --- a/Sources/Search/SearchResult.php +++ b/Sources/Search/SearchResult.php @@ -240,9 +240,9 @@ public function format(int $counter = 0, array $format_options = []): array $this->body = Utils::htmlspecialcharsDecode(strtr($this->body, [' ' => ' ', '
' => "\n", '[' => '[', ']' => ']', ':' => ':', '@' => '@'])); if (empty(Config::$modSettings['search_method']) || $force_partial_word) { - preg_match_all('/([^\s\W]{' . $charLimit . '}[\s\W]|[\s\W].{0,' . $charLimit . '}?|^)(' . $matchString . ')(.{0,' . $charLimit . '}[\s\W]|[^\s\W]{0,' . $charLimit . '})/is' . (Utils::$context['utf8'] ? 'u' : ''), $this->body, $matches); + preg_match_all('/([^\s\W]{' . $charLimit . '}[\s\W]|[\s\W].{0,' . $charLimit . '}?|^)(' . $matchString . ')(.{0,' . $charLimit . '}[\s\W]|[^\s\W]{0,' . $charLimit . '})/isu', $this->body, $matches); } else { - preg_match_all('/([^\s\W]{' . $charLimit . '}[\s\W]|[\s\W].{0,' . $charLimit . '}?[\s\W]|^)(' . $matchString . ')([\s\W].{0,' . $charLimit . '}[\s\W]|[\s\W][^\s\W]{0,' . $charLimit . '})/is' . (Utils::$context['utf8'] ? 'u' : ''), $this->body, $matches); + preg_match_all('/([^\s\W]{' . $charLimit . '}[\s\W]|[\s\W].{0,' . $charLimit . '}?[\s\W]|^)(' . $matchString . ')([\s\W].{0,' . $charLimit . '}[\s\W]|[\s\W][^\s\W]{0,' . $charLimit . '})/isu', $this->body, $matches); } $this->body = ''; diff --git a/Sources/ServerSideIncludes.php b/Sources/ServerSideIncludes.php index fa90c3d3b3..edbcaa7e5c 100644 --- a/Sources/ServerSideIncludes.php +++ b/Sources/ServerSideIncludes.php @@ -1509,7 +1509,7 @@ public static function login(string $redirect_to = '', string $output_method = ' SecurityToken::create('login'); echo ' -
+ @@ -1582,7 +1582,7 @@ public static function recentPoll(bool $topPollInstead = false, string $output_m if ($return['allow_vote']) { echo ' - + ', $return['question'], '
', !empty($return['allowed_warning']) ? $return['allowed_warning'] . '
' : ''; @@ -1669,7 +1669,7 @@ public static function showPoll(?int $topic = null, string $output_method = 'ech if ($return['allow_vote']) { echo ' - + ', $return['question'], '
', !empty($return['allowed_warning']) ? $return['allowed_warning'] . '
' : ''; @@ -1861,7 +1861,7 @@ public static function quickSearch(string $output_method = 'echo'): ?string } echo ' - + '; @@ -2799,7 +2799,7 @@ public function __construct() // @todo: probably not the best place, but somewhere it should be set... if (!headers_sent()) { - header('content-type: text/html; charset=' . (empty(Config::$modSettings['global_character_set']) ? (empty(Lang::$txt['lang_character_set']) ? 'ISO-8859-1' : Lang::$txt['lang_character_set']) : Config::$modSettings['global_character_set'])); + header('content-type: text/html; charset=UTF-8'); } // Take care of any banning that needs to be done. diff --git a/Sources/Subs-Compat.php b/Sources/Subs-Compat.php index c9a063f943..3f5892c0fe 100644 --- a/Sources/Subs-Compat.php +++ b/Sources/Subs-Compat.php @@ -5243,7 +5243,7 @@ function call_helper(mixed $input, bool $return = false): mixed function replaceEntities__callback(array $matches): string { return strtr( - htmlspecialchars(Utils::entityDecode($matches[1], true), ENT_QUOTES), + htmlspecialchars(Utils::entityDecode($matches[1]), ENT_QUOTES), [ '&' => '&', '"' => '"', @@ -5255,7 +5255,7 @@ function replaceEntities__callback(array $matches): string function fixchar__callback(array $matches): string { - return Utils::entityDecode($matches[0], true); + return Utils::entityDecode($matches[0]); } function entity_fix__callback(array $matches): string @@ -5304,17 +5304,6 @@ function sanitizeMSCutPaste(string $string): string "\xe2\x80\x9d", // right double curly quote, U+201D ]; - // windows 1252 / iso equivalents - $findchars_iso = [ - chr(130), - chr(132), - chr(133), - chr(145), - chr(146), - chr(147), - chr(148), - ]; - // safe replacements $replacechars = [ ',', // ‚ @@ -5326,9 +5315,7 @@ function sanitizeMSCutPaste(string $string): string '"', // ” ]; - $encoding = (!empty(Utils::$context['utf8']) ? 'UTF-8' : (!empty(SMF\Config::$modSettings['global_character_set']) ? SMF\Config::$modSettings['global_character_set'] : (!empty(SMF\Lang::$txt['lang_character_set']) ? SMF\Lang::$txt['lang_character_set'] : 'UTF-8'))); - - $string = str_replace($encoding === 'UTF-8' ? $findchars_utf8 : $findchars_iso, $replacechars, $string); + $string = str_replace($findchars_utf8, $replacechars, $string); return $string; } diff --git a/Sources/Tasks/SendDigests.php b/Sources/Tasks/SendDigests.php index 5b8ba66b4b..263b1643c9 100644 --- a/Sources/Tasks/SendDigests.php +++ b/Sources/Tasks/SendDigests.php @@ -187,7 +187,7 @@ public function execute(): bool $langtxt[$lang] = [ 'subject' => Lang::$txt['digest_subject_' . ($is_weekly ? 'weekly' : 'daily')], - 'char_set' => Lang::$txt['lang_character_set'], + 'char_set' => 'UTF-8', 'intro' => Lang::getTxt('digest_intro_' . ($is_weekly ? 'weekly' : 'daily'), ['forum_name' => Config::$mbname]), 'new_topics' => Lang::$txt['digest_new_topics'], 'topic_lines' => Lang::$txt['digest_new_topics_line'], @@ -223,9 +223,6 @@ public function execute(): bool continue; } - // Right character set! - Utils::$context['character_set'] = empty(Config::$modSettings['global_character_set']) ? $langtxt[$lang]['char_set'] : Config::$modSettings['global_character_set']; - // Do the start stuff! $email = [ 'subject' => Config::$mbname . ' - ' . $langtxt[$lang]['subject'], diff --git a/Sources/Theme.php b/Sources/Theme.php index 1e0d074051..cd3d73f7d4 100644 --- a/Sources/Theme.php +++ b/Sources/Theme.php @@ -261,8 +261,6 @@ public static function loadEssential(): void Lang::load('General+Modifications'); // Just in case it wasn't already set elsewhere. - Utils::$context['character_set'] = empty(Config::$modSettings['global_character_set']) ? Lang::$txt['lang_character_set'] : Config::$modSettings['global_character_set']; - Utils::$context['utf8'] = Utils::$context['character_set'] === 'UTF-8'; Utils::$context['right_to_left'] = !empty(Lang::$txt['lang_rtl']); // Tell ErrorHandler::fatalLang() to not reload the theme. @@ -1287,13 +1285,13 @@ public static function template_header(): void if (!isset($_REQUEST['xml']) && isset($_GET['debug']) && !BrowserDetector::isBrowser('ie')) { header('content-type: application/xhtml+xml'); } elseif (!isset($_REQUEST['xml'])) { - header('content-type: text/html; charset=' . (empty(Utils::$context['character_set']) ? 'ISO-8859-1' : Utils::$context['character_set'])); + header('content-type: text/html; charset=UTF-8'); } } $content_type = Forum::getCurrentAction()?->getOutputType()->getMimeType() ?? 'text/' . (isset($_REQUEST['xml']) ? 'xml' : 'html'); - header('Content-Type: ' . $content_type . '; charset=' . (empty(Utils::$context['character_set']) ? 'ISO-8859-1' : Utils::$context['character_set'])); + header('Content-Type: ' . $content_type . '; charset=UTF-8'); // Collect layers to be added $layers = []; @@ -1305,6 +1303,7 @@ public static function template_header(): void // Add security warning if security issues are detected Utils::$context['warnings'] = Security::checkSecurityFiles(); + if (Utils::$context['warnings']) { $layers[] = 'security_warning'; } @@ -1948,8 +1947,7 @@ protected function initialize(): void User::$me->time_format = Lang::$txt['time_format']; } - // Set the character set from the template. - Utils::$context['character_set'] = empty(Config::$modSettings['global_character_set']) ? Lang::$txt['lang_character_set'] : Config::$modSettings['global_character_set']; + // Set the text direction from the language strings. Utils::$context['right_to_left'] = !empty(Lang::$txt['lang_rtl']); // Guests may still need a name. @@ -2282,7 +2280,7 @@ protected function loadJavaScript(): void 'smf_avatars_url' => '"' . Config::$modSettings['avatar_url'] . '"', 'smf_scripturl' => '"' . Config::$scripturl . '"', 'smf_iso_case_folding' => Sapi::supportsIsoCaseFolding() ? 'true' : 'false', - 'smf_charset' => '"' . Utils::$context['character_set'] . '"', + 'smf_charset' => '"UTF-8"', 'smf_session_id' => '"' . Utils::$context['session_id'] . '"', 'smf_session_var' => '"' . Utils::$context['session_var'] . '"', 'smf_member_id' => User::$me->id, @@ -2432,7 +2430,7 @@ protected static function templateInclude(string $filename, bool $once = false): } if (isset($_GET['debug'])) { - header('content-type: application/xhtml+xml; charset=' . (empty(Utils::$context['character_set']) ? 'ISO-8859-1' : Utils::$context['character_set'])); + header('content-type: application/xhtml+xml; charset=UTF-8'); } // Don't cache error pages!! @@ -2448,11 +2446,7 @@ protected static function templateInclude(string $filename, bool $once = false): } // First, let's get the doctype and language information out of the way. - echo '' . "\n" . '' . "\n\t" . ''; - - if (isset(Utils::$context['character_set'])) { - echo "\n\t\t" . ''; - } + echo '' . "\n" . '' . "\n\t" . '' . "\n\t\t" . ''; if (!empty(Config::$maintenance) && !User::$me->allowedTo('admin_forum')) { echo "\n\t\t" . '', Config::$mtitle, '' . "\n\t" . '' . "\n\t" . '' . "\n\t\t" . '

', Config::$mtitle, '

' . "\n\t\t", Config::$mmessage, "\n\t" . '' . "\n" . ''; diff --git a/Sources/Unicode/SpoofDetector.php b/Sources/Unicode/SpoofDetector.php index 9673717b21..5b15aa8d4e 100644 --- a/Sources/Unicode/SpoofDetector.php +++ b/Sources/Unicode/SpoofDetector.php @@ -42,10 +42,6 @@ class SpoofDetector */ public static function getSkeletonString(string $string): string { - if (empty(Utils::$context['utf8'])) { - return $string; - } - $chars = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); if ($chars === false) { @@ -80,10 +76,6 @@ public static function getSkeletonString(string $string): string */ public static function resolveScriptSet(string $string): array { - if (empty(Utils::$context['utf8'])) { - return []; - } - $chars = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); if ($chars === false) { @@ -150,7 +142,7 @@ public static function resolveScriptSet(string $string): array */ public static function enhanceWordCensor(string $text): void { - if (empty(Utils::$context['utf8']) || empty(Config::$modSettings['spoofdetector_censor'])) { + if (empty(Config::$modSettings['spoofdetector_censor'])) { return; } diff --git a/Sources/Unicode/Utf8String.php b/Sources/Unicode/Utf8String.php index ebc9cb8baa..92c6b4984a 100644 --- a/Sources/Unicode/Utf8String.php +++ b/Sources/Unicode/Utf8String.php @@ -662,7 +662,7 @@ public function extractWords(int $level): array $this->string = Utils::sanitizeEntities($this->string, ' '); // Decode all the entities. - $this->string = Utils::entityDecode($this->string, true, ENT_QUOTES | ENT_HTML5, true); + $this->string = Utils::entityDecode($this->string, ENT_QUOTES | ENT_HTML5, true); // Replace unwanted invisible characters with spaces. $this->sanitizeInvisibles($level, ' '); diff --git a/Sources/User.php b/Sources/User.php index b7d07e6653..b9483c801d 100644 --- a/Sources/User.php +++ b/Sources/User.php @@ -3742,7 +3742,7 @@ public static function validateUsername(int $memID, string $username, bool $retu */ public static function isReservedName(string $name, int $current_id_member = 0, bool $is_name = true, bool $fatal = true): bool { - $name = Utils::entityDecode($name, true); + $name = Utils::entityDecode($name); $checkName = Utils::strtolower($name); // Administrators are never restricted ;). diff --git a/Sources/Utils.php b/Sources/Utils.php index c9e99e49f6..44cbf65225 100644 --- a/Sources/Utils.php +++ b/Sources/Utils.php @@ -244,7 +244,7 @@ class Utils * SMF's venerable $context variable, now available as Utils::$context. */ public static $context = [ - // Assume UTF-8 until proven otherwise. + // We always use UTF-8, but some old mods might want to check. 'utf8' => true, 'character_set' => 'UTF-8', // Define a list of icons used across multiple places. @@ -341,63 +341,38 @@ public static function load(): void self::$context['browser_cache'] = '?' . preg_replace('~\W~', '', strtolower(SMF_FULL_VERSION)) . '_' . Config::$modSettings['browser_cache']; } - // UTF-8? - if (isset(Config::$modSettings['global_character_set'])) { - self::$context['character_set'] = Config::$modSettings['global_character_set']; - self::$context['utf8'] = self::$context['character_set'] === 'UTF-8'; - } - // Load up our $context['server'] data for backwards compatibility Sapi::load(); } /** - * Decodes and sanitizes HTML entities. - * - * If database does not support 4-byte UTF-8 characters, entities for 4-byte - * characters are left in place, unless the $mb4 argument is set to true. + * Decodes and sanitizes named and numerical character entities. * * @param string $string The string in which to decode entities. - * @param bool $mb4 If true, always decode 4-byte UTF-8 characters. - * Default: false. * @param int $flags Flags to pass to html_entity_decode. * Default: ENT_QUOTES | ENT_HTML5. * @param bool $nbsp_to_space If true, decode ' ' to space character. * Default: false. * @return string The string with the entities decoded. */ - public static function entityDecode(string $string, bool $mb4 = false, int $flags = ENT_QUOTES | ENT_HTML5, bool $nbsp_to_space = false): string + public static function entityDecode(string $string, int $flags = ENT_QUOTES | ENT_HTML5, bool $nbsp_to_space = false): string { // Don't waste time on empty strings. if (trim($string) === '') { return $string; } - // In theory this is always UTF-8, but... - if (empty(self::$context['character_set'])) { - $charset = is_callable('mb_detect_encoding') ? mb_detect_encoding($string) : 'UTF-8'; - } elseif (str_contains(self::$context['character_set'], 'ISO-8859-') && !in_array(self::$context['character_set'], ['ISO-8859-5', 'ISO-8859-15'])) { - $charset = 'ISO-8859-1'; - } else { - $charset = self::$context['character_set']; - } - // Enables consistency with the behaviour of un_htmlspecialchars. if ($nbsp_to_space) { $string = preg_replace('~' . self::ENT_NBSP . '~u', ' ', $string); } // Do the deed. - $string = html_entity_decode($string, $flags, $charset); + $string = html_entity_decode($string, $flags, 'UTF-8'); // Remove any illegal character entities. $string = self::sanitizeEntities($string); - // Finally, make sure we don't break the database. - if (!$mb4) { - $string = self::fixUtf8mb4($string); - } - return $string; } @@ -479,52 +454,30 @@ function ($matches) use ($substitute) { * 2: Disallow all formatting characters. Use for internal comparisons * only, such as in the word censor, search contexts, etc. * Default: 0. - * @param string|null $substitute Replacement string for the invalid characters. - * If not set, the Unicode replacement character (U+FFFD) will be used - * (or a fallback like "?" if necessary). + * @param string $substitute Replacement string for the invalid characters. + * Default: the Unicode replacement character (U+FFFD). * @return string|false The sanitized string, or false on failure. */ - public static function sanitizeChars(string $string, int $level = 0, ?string $substitute = null): string|false + public static function sanitizeChars(string $string, int $level = 0, string $substitute = "\u{FFFD}"): string|false { $string = (string) $string; $level = min(max((int) $level, 0), 2); - // What substitute character should we use? - if (isset($substitute)) { - $substitute = strval($substitute); - } elseif (!empty(Utils::$context['utf8'])) { - // Raw UTF-8 bytes for U+FFFD. - $substitute = "\xEF\xBF\xBD"; - } elseif (!empty(Utils::$context['character_set']) && is_callable('mb_decode_numericentity')) { - // Get whatever the default replacement character is for this encoding. - $substitute = mb_decode_numericentity('�', [0xFFFD, 0xFFFD, 0, 0xFFFF], Utils::$context['character_set']); - } else { - $substitute = '?'; - } - // Fix any invalid byte sequences. - if (!empty(Utils::$context['character_set'])) { - // For UTF-8, this preg_match test is much faster than mb_check_encoding. - $malformed = !empty(Utils::$context['utf8']) ? @preg_match('//u', $string) === false && preg_last_error() === PREG_BAD_UTF8_ERROR : (!is_callable('mb_check_encoding') || !mb_check_encoding($string, Utils::$context['character_set'])); - - if ($malformed) { - // mb_convert_encoding will replace invalid byte sequences with our substitute. - if (is_callable('mb_convert_encoding')) { - if (!is_callable('mb_ord')) { - require_once Config::$sourcedir . '/Subs-Compat.php'; - } - - $substitute_ord = $substitute === '' ? 'none' : mb_ord($substitute, Utils::$context['character_set']); + // For UTF-8, this preg_match test is much faster than mb_check_encoding. + if (@preg_match('//u', $string) === false && preg_last_error() === PREG_BAD_UTF8_ERROR) { + // mb_convert_encoding will replace invalid byte sequences with our substitute. + if (is_callable('mb_convert_encoding')) { + $substitute_ord = $substitute === '' ? 'none' : mb_ord($substitute, 'UTF-8'); - $mb_substitute_character = mb_substitute_character(); - mb_substitute_character($substitute_ord); + $mb_substitute_character = mb_substitute_character(); + mb_substitute_character($substitute_ord); - $string = mb_convert_encoding($string, Utils::$context['character_set'], Utils::$context['character_set']); + $string = mb_convert_encoding($string, 'UTF-8', 'UTF-8'); - mb_substitute_character($mb_substitute_character); - } else { - return false; - } + mb_substitute_character($mb_substitute_character); + } else { + return false; } } @@ -532,11 +485,7 @@ public static function sanitizeChars(string $string, int $level = 0, ?string $su $string = Utils::normalizeSpaces($string, true); // Deal with unwanted control characters, invisible formatting characters, and other creepy-crawlies. - if (!empty(Utils::$context['utf8'])) { - $string = (string) Unicode\Utf8String::create($string)->sanitizeInvisibles($level, $substitute); - } else { - $string = preg_replace('/[^\P{Cc}\t\r\n]/', $substitute, $string); - } + $string = (string) Unicode\Utf8String::create($string)->sanitizeInvisibles($level, $substitute); return $string; } @@ -575,13 +524,13 @@ public static function normalizeSpaces(string $string, bool $vspace = true, bool if ($vspace) { // \R is like \v, except it handles "\r\n" as a single unit. - $patterns[] = '/\R/' . (Utils::$context['utf8'] ? 'u' : ''); + $patterns[] = '/\R/u'; $replacements[] = $options['no_breaks'] ? ' ' : "\n"; } if ($hspace) { // Interesting fact: Unicode properties like \p{Zs} work even when not in UTF-8 mode. - $patterns[] = '/' . ($options['replace_tabs'] ? '\h' : '\p{Zs}') . ($options['collapse_hspace'] ? '+' : '') . '/' . (Utils::$context['utf8'] ? 'u' : ''); + $patterns[] = '/' . ($options['replace_tabs'] ? '\h' : '\p{Zs}') . ($options['collapse_hspace'] ? '+' : '') . '/u'; $replacements[] = ' '; } @@ -589,8 +538,8 @@ public static function normalizeSpaces(string $string, bool $vspace = true, bool } /** - * Wrapper for standard htmlspecialchars() that ensures the output respects - * the database's support (or lack thereof) for four-byte UTF-8 characters. + * Wrapper for standard htmlspecialchars() that additionally normalizes and + * sanitizes the string. * * @param string $string The string being converted. * @param int $flags Bitmask of flags to pass to standard htmlspecialchars(). @@ -602,7 +551,7 @@ public static function htmlspecialchars(string $string, int $flags = ENT_COMPAT, { $string = self::normalize($string); - return self::fixUtf8mb4(self::sanitizeEntities(\htmlspecialchars($string, $flags, $encoding))); + return self::sanitizeEntities(\htmlspecialchars($string, $flags, $encoding)); } /** @@ -929,7 +878,7 @@ public static function convertCase(string $string, string $case, bool $simple = $string = (string) Unicode\Utf8String::create($string)->convertCase($case, $simple)->normalize($form); } - return self::fixUtf8mb4($string); + return $string; } /** @@ -1265,7 +1214,7 @@ public static function cleanXml(string $string): string "\x0B", "\x0C", "\x0E", "\x0F", "\x10", "\x11", "\x12", "\x13", "\x14", "\x15", "\x16", "\x17", "\x18", "\x19", "\x1A", "\x1B", "\x1C", "\x1D", "\x1E", "\x1F", - // Remove \xFFFE and \xFFFF + // Remove U-FFFE and U-FFFF "\xEF\xBF\xBE", "\xEF\xBF\xBF", ]; @@ -1273,8 +1222,8 @@ public static function cleanXml(string $string): string // The Unicode surrogate pair code points should never be present in our // strings to begin with, but if any snuck in, they need to be removed. - if (!empty(Utils::$context['utf8']) && str_contains($string, "\xED")) { - $string = preg_replace('/\xED[\xA0-\xBF][\x80-\xBF]/', '', $string); + if (str_contains($string, "\xED")) { + $string = preg_replace('/[\x{D800}-\x{DFFF}]/u', '', $string); } return $string; @@ -2037,11 +1986,8 @@ public static function emitFile(\ArrayAccess|array $file, bool $show_thumb = fal $file['filename'] = hash_hmac('md5', var_export($file, true), Config::$image_proxy_secret) . '.' . ltrim($file['fileext'] ?? 'dat', '.'); } - // Convert the filename to UTF-8, cuz most browsers dig that. - $file['filename'] = !self::$context['utf8'] ? mb_convert_encoding($file['filename'], 'UTF-8', self::$context['character_set']) : $file['filename']; - - // Also provide a plain ASCII name for the sake of old browsers. - $file['asciiname'] = preg_replace('/[\x{80}-\x{10FFFF}]+/u', '?', Utils::entityDecode($file['filename'], true)); + // Provide a plain ASCII name for the sake of old browsers. + $file['asciiname'] = preg_replace('/[\x{80}-\x{10FFFF}]+/u', '?', Utils::entityDecode($file['filename'])); // Replace ASCII names like ??????.jpg with something more unique. if (strspn($file['asciiname'], '?') === strpos($file['asciiname'], '.')) { diff --git a/Themes/default/Admin.template.php b/Themes/default/Admin.template.php index 1f4e0479c0..a26a995e61 100644 --- a/Themes/default/Admin.template.php +++ b/Themes/default/Admin.template.php @@ -607,7 +607,7 @@ function template_edit_censored() // First section is for adding/removing words from the censored list. echo ' -
+

', Lang::$txt['admin_censored_words'], ' @@ -717,7 +717,7 @@ function template_not_done()

'; echo ' - '; + '; // Do we have a token? if (isset(Utils::$context['not_done_token']) && isset(Utils::$context[Utils::$context['not_done_token'] . '_token'], Utils::$context[Utils::$context['not_done_token'] . '_token_var'])) @@ -768,7 +768,7 @@ function template_show_settings() echo Utils::$context['settings_insert_above']; echo ' - '; + '; // Is there a custom title? if (isset(Utils::$context['settings_title'])) @@ -1147,7 +1147,7 @@ function template_edit_profile_field() } echo ' - +

', Utils::$context['page_title'], '

@@ -1667,7 +1667,7 @@ function template_clean_cache_button_below()

', Lang::$txt['maintain_cache'], '

- +

', Lang::$txt['maintain_cache_info'], '

@@ -1683,7 +1683,7 @@ function template_admin_quick_search() { if (User::$me->is_admin) echo ' - + '; if (!empty($calendar_data['end_date'])) @@ -844,7 +844,7 @@ function template_calendar_top($calendar_data) function template_event_post() { echo ' - '; + '; if (!empty(Utils::$context['event']->new)) echo ' diff --git a/Themes/default/Display.template.php b/Themes/default/Display.template.php index 380e70d06d..5c696b03e0 100644 --- a/Themes/default/Display.template.php +++ b/Themes/default/Display.template.php @@ -132,7 +132,7 @@ function template_main() else { echo ' - '; + '; // Show a warning if they are allowed more than one option. if (Utils::$context['poll']['allowed_warning']) @@ -207,7 +207,7 @@ function template_main() // Show the topic information - icon, subject, etc. echo '
- '; + '; Utils::$context['ignoredMsgs'] = array(); Utils::$context['removableMessageIDs'] = array(); @@ -898,7 +898,7 @@ function template_quickreply()

', Lang::$txt['wait_for_approval'], '

'; echo ' - + @@ -986,7 +986,7 @@ function insertQuoteFast(messageid) if (Utils::$context['show_spellchecking']) echo ' - + '; diff --git a/Themes/default/Errors.template.php b/Themes/default/Errors.template.php index 91b892ed45..ef0867c87d 100644 --- a/Themes/default/Errors.template.php +++ b/Themes/default/Errors.template.php @@ -64,7 +64,7 @@ function template_fatal_error() function template_error_log() { echo ' -
+

', Lang::$txt['errorlog'], ' @@ -204,7 +204,7 @@ function template_show_file() echo ' - + ', Utils::$context['file_data']['file'], ' ', Theme::template_css(), ' @@ -265,7 +265,7 @@ function template_show_backtrace() echo ' - + ', Lang::$txt['backtrace_title'], ''; Theme::template_css(); diff --git a/Themes/default/GenericList.template.php b/Themes/default/GenericList.template.php index 19cd43dad2..2d75c3cab8 100644 --- a/Themes/default/GenericList.template.php +++ b/Themes/default/GenericList.template.php @@ -29,7 +29,7 @@ function template_show_list($list_id = null) if (isset($cur_list['form'])) echo ' - '; + '; // Show the title of the table (if any). if (!empty($cur_list['title'])) diff --git a/Themes/default/Help.template.php b/Themes/default/Help.template.php index 4576d035ef..b04d9763d0 100644 --- a/Themes/default/Help.template.php +++ b/Themes/default/Help.template.php @@ -24,7 +24,7 @@ function template_popup() echo ' - + ', Utils::$context['page_title'], ' ', Theme::template_css(), ' diff --git a/Themes/default/Likes.template.php b/Themes/default/Likes.template.php index 77e2a8d169..eb46d9cfa7 100644 --- a/Themes/default/Likes.template.php +++ b/Themes/default/Likes.template.php @@ -24,7 +24,7 @@ function template_popup() echo ' - + ', Utils::$context['page_title'], ' ', Theme::template_css(), ' diff --git a/Themes/default/Login.template.php b/Themes/default/Login.template.php index bcf1da088c..ad5e7d59a1 100644 --- a/Themes/default/Login.template.php +++ b/Themes/default/Login.template.php @@ -28,7 +28,7 @@ function template_login()

- '; + '; // Did they make a mistake last time? if (!empty(Utils::$context['login_errors'])) @@ -290,7 +290,7 @@ function template_kick_guest() { // This isn't that much... just like normal login but with a message at the top. echo ' - +