######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, see
# <https://www.gnu.org/licenses/>.
######################### END LICENSE BLOCK #########################
import logging
import re
from typing import Optional, Union
from .enums import EncodingEra, LanguageFilter, ProbingState
from .metadata.charsets import Charset, get_charset
# Matches a single "international word" inside a byte buffer (rationale: see
# paper section 4.7, Two-Char Sequence Distribution). For scripts that do not
# use Latin letters, words made up solely of ASCII letters are dropped, while
# any word holding at least one high-byte (>= 0x80) character is retained.
# The adjacent bytes literals below are joined at compile time into one
# pattern; they are split only so each structural piece can be annotated.
INTERNATIONAL_WORDS_PATTERN = re.compile(
    b"[a-zA-Z]*"  # optional run of ASCII letters before the high bytes
    b"[\x80-\xff]+"  # one or more high-byte characters (required)
    b"[a-zA-Z]*"  # optional run of ASCII letters after the high bytes
    b"[^a-zA-Z\x80-\xff]?"  # optional single trailing marker byte
)
class CharSetProber:
    """Base class for charset probers.

    Holds the shared probing state and the byte-filtering helpers. ``feed``
    and ``language`` raise ``NotImplementedError`` here and must be provided
    by subclasses; ``charset_name`` and ``get_confidence`` return neutral
    defaults (``None`` and ``0.0``) unless overridden.
    """

    # NOTE(review): not referenced inside this class; presumably the
    # confidence above which a caller may stop probing early -- confirm
    # against the code that reads it.
    SHORTCUT_THRESHOLD = 0.95

    def __init__(
        self,
        *,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        encoding_era: EncodingEra = EncodingEra.ALL,
    ) -> None:
        """Initialise the prober in the ``DETECTING`` state.

        Args:
            lang_filter: Language filter stored on the instance
                (keyword-only).
            encoding_era: Encoding era stored on the instance
                (keyword-only).
        """
        self._state = ProbingState.DETECTING
        # NOTE(review): never read in this class; presumably consulted by
        # whatever drives a group of probers -- confirm against callers.
        self.active = True
        self.lang_filter = lang_filter
        self.encoding_era = encoding_era
        self.logger = logging.getLogger(__name__)

    def reset(self) -> None:
        """Put the prober back into the ``DETECTING`` state."""
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self) -> Optional[str]:
        """Name of the charset this prober targets; ``None`` in the base class."""
        return None

    @property
    def charset(self) -> Optional[Charset]:
        """Return the Charset metadata for this prober's encoding.

        Returns ``None`` when ``charset_name`` is ``None``; otherwise the
        result of ``get_charset`` for that name.
        """
        name = self.charset_name
        if name is None:
            return None
        return get_charset(name)

    @property
    def language(self) -> Optional[str]:
        """Language associated with this prober; subclasses must implement it."""
        raise NotImplementedError

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Consume a chunk of input bytes and return the probing state.

        Not implemented in the base class; subclasses must override it.
        """
        raise NotImplementedError

    @property
    def state(self) -> ProbingState:
        """Current probing state of this prober."""
        return self._state

    def get_confidence(self) -> float:
        """Return this prober's confidence; always ``0.0`` in the base class."""
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
        """Replace every run of ASCII bytes (0x00-0x7f) with a single space.

        Bytes with the high bit set are left untouched, so the result holds
        only the high-byte runs of ``buf`` separated by single spaces.
        """
        return re.sub(b"([\x00-\x7f])+", b" ", buf)

    @staticmethod
    def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
        r"""Filter out ASCII-only words for non-Latin scripts.

        Byte classes:

        - alphabet: ASCII letters ``[a-zA-Z]``
        - international: bytes with the high bit set ``[\x80-\xff]``
        - marker: everything else ``[^a-zA-Z\x80-\xff]``

        The buffer is treated as a sequence of "words" separated by marker
        bytes. Only words containing at least one high-byte character are
        kept, i.e. those matching: optional ASCII prefix + one or more high
        bytes + optional ASCII suffix, plus at most one trailing marker.
        Pure-ASCII words are discarded as noise when the target language
        model excludes ASCII letters ("English words in other-language
        pages", paper section 4.7).

        Surrounding ASCII letters are retained rather than stripped because
        this preserves real adjacency for bigram modelling around high-byte
        letters and avoids creating artificial bigrams between high-byte
        characters that were not adjacent in the input.

        Trailing-marker normalisation: when a kept word ends in a byte below
        0x80 that is not an ASCII letter, that byte is replaced with a
        space. Because the pattern consumes at most one marker per word and
        unmatched bytes are dropped, a run of markers collapses into a
        single delimiter.

        Usage is conditional: callers apply this only when the language
        model's ``keep_ascii_letters`` is False (see
        ``SingleByteCharSetProber.feed``). Latin-script languages skip this
        and use ``remove_xml_tags`` instead. This mirrors the original
        universalchardet / uchardet approach and the training pipeline,
        which excludes ASCII letters for non-Latin alphabets.

        Args:
            buf: Raw input bytes.

        Returns:
            A new ``bytearray`` holding the kept words, each trailing marker
            normalised to a space.
        """
        filtered = bytearray()
        # Every match contains at least one high byte, so it is never empty
        # and slicing off the last byte is always safe.
        for word in INTERNATIONAL_WORDS_PATTERN.findall(buf):
            body, last_char = word[:-1], word[-1:]
            filtered.extend(body)
            # Markers are used similarly across all languages and so carry
            # no signal for the analysis; normalise a trailing one to a
            # space. Letters and high bytes in last position are kept.
            if not last_char.isalpha() and last_char < b"\x80":
                last_char = b" "
            filtered.extend(last_char)
        return filtered