Source code for chardet.charsetprober

######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, see
# <https://www.gnu.org/licenses/>.
######################### END LICENSE BLOCK #########################

import logging
import re
from typing import Optional, Union

from .enums import EncodingEra, LanguageFilter, ProbingState
from .metadata.charsets import Charset, get_charset

# Matches one "international word" in a byte buffer (rationale: paper section
# 4.7, Two-Char Sequence Distribution). For scripts without Latin letters,
# words made only of ASCII letters are noise, so a word qualifies only if it
# contains at least one high byte (>= 0x80). The pieces below are adjacent
# bytes literals, which Python concatenates into a single pattern.
INTERNATIONAL_WORDS_PATTERN = re.compile(
    b"[a-zA-Z]*"  # optional run of ASCII letters before the high bytes
    b"[\x80-\xff]+"  # one or more high bytes (required)
    b"[a-zA-Z]*"  # optional run of ASCII letters after the high bytes
    b"[^a-zA-Z\x80-\xff]?"  # at most one trailing marker (any other byte)
)


class CharSetProber:
    """Common base for charset probers.

    Holds the probing state and the language/era filters, and provides static
    byte-buffer filters. ``feed`` and ``language`` raise
    ``NotImplementedError`` here; ``charset_name`` and ``get_confidence``
    return placeholder values (``None`` and ``0.0``).
    """

    # Not referenced inside this class. Presumably the confidence level at
    # which users of the prober stop early -- confirm against callers.
    SHORTCUT_THRESHOLD = 0.95

    def __init__(
        self,
        *,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        encoding_era: EncodingEra = EncodingEra.ALL,
    ) -> None:
        """Store the filters and put the prober in the DETECTING state.

        Args:
            lang_filter: Language filter kept on ``self.lang_filter``.
            encoding_era: Encoding era kept on ``self.encoding_era``.
        """
        self.logger = logging.getLogger(__name__)
        self.lang_filter = lang_filter
        self.encoding_era = encoding_era
        # Starts out True; nothing in this class changes it afterwards.
        self.active = True
        self._state = ProbingState.DETECTING

    def reset(self) -> None:
        """Put the prober back into the DETECTING state."""
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self) -> Optional[str]:
        """Name of the detected charset; always ``None`` in this base class."""
        return None

    @property
    def charset(self) -> Optional[Charset]:
        """Charset metadata looked up from ``charset_name``.

        Returns ``None`` when ``charset_name`` is ``None``; otherwise the
        result of ``get_charset`` for that name.
        """
        name = self.charset_name
        return None if name is None else get_charset(name)

    @property
    def language(self) -> Optional[str]:
        """Language of the detected text; not implemented in this base class."""
        raise NotImplementedError

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Consume a chunk of input; not implemented in this base class."""
        raise NotImplementedError

    @property
    def state(self) -> ProbingState:
        """Current probing state."""
        return self._state

    def get_confidence(self) -> float:
        """Confidence in the current guess; always ``0.0`` in this base class."""
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
        """Replace every run of ASCII bytes (0x00-0x7F) with one space.

        High bytes (0x80-0xFF) are left untouched.
        """
        return re.sub(b"([\x00-\x7f])+", b" ", buf)

    @staticmethod
    def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
        r"""Keep only the words of ``buf`` that contain a high byte.

        Byte classes used by ``INTERNATIONAL_WORDS_PATTERN``:

        - alphabet: ASCII letters ``[a-zA-Z]``
        - international: bytes with the high bit set ``[\x80-\xff]``
        - marker: every other byte ``[^a-zA-Z\x80-\xff]``

        A kept word is: optional ASCII letters, one or more international
        bytes, optional ASCII letters, and at most one trailing marker. Words
        made only of ASCII letters never match and are therefore dropped
        (English words inside other-language pages; paper section 4.7).

        The ASCII letters around the international bytes are retained so that
        the adjacency seen by bigram models is real: stripping them would
        place international bytes next to each other that were not adjacent
        in the input.

        A trailing marker is emitted as a single space. Because the pattern
        accepts only one marker per word, a run of markers between two kept
        words collapses into that one space.

        According to the notes that accompany this helper, callers apply it
        only when the language model's ``keep_ascii_letters`` is False (see
        ``SingleByteCharSetProber.feed``), and Latin-script models use
        ``remove_xml_tags`` instead; that call site is outside this class.
        """
        filtered = bytearray()
        for word in INTERNATIONAL_WORDS_PATTERN.findall(buf):
            # A match always holds at least one high byte, so it is never
            # empty and the one-byte tail slice is well defined.
            head, tail = word[:-1], word[-1:]
            filtered.extend(head)
            # Markers behave alike across languages, so they carry no signal
            # for the analysis; normalise a trailing one to a plain space.
            if tail < b"\x80" and not tail.isalpha():
                tail = b" "
            filtered.extend(tail)
        return filtered

    @staticmethod
    def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytearray:
        """Return a copy of ``buf`` with ``<...>`` tag spans removed.

        The buffer is scanned byte by byte:

        - ``<`` opens a tag. If a non-empty stretch of text was pending and we
          were not already inside a tag, that stretch is emitted followed by
          one space as a delimiter.
        - ``>`` closes a tag and restarts the pending stretch at the next
          byte. This also happens for a ``>`` outside any tag, so text
          between the previous restart point and a stray ``>`` is discarded.
        - At the end, the pending stretch is emitted unless a tag is still
          open.

        No other bytes are filtered; everything outside tags is copied as is.
        The accompanying notes describe this as suitable for scripts mixing
        ASCII letters with extended-ASCII bytes (e.g. ``Latin1Prober``).
        """
        kept = bytearray()
        inside_tag = False
        stretch_start = 0

        # The "c" cast makes iteration yield length-1 bytes objects.
        for index, char in enumerate(memoryview(buf).cast("c")):
            # https://github.com/python/typeshed/issues/8182
            if char == b">":  # type: ignore[comparison-overlap]
                stretch_start = index + 1
                inside_tag = False
                continue
            # https://github.com/python/typeshed/issues/8182
            if char != b"<":  # type: ignore[comparison-overlap]
                continue
            if not inside_tag and index > stretch_start:
                # Flush the text preceding this tag, then delimit it.
                kept.extend(buf[stretch_start:index])
                kept.extend(b" ")
            inside_tag = True

        if inside_tag:
            # Unterminated tag: the tail belongs to the tag and is dropped.
            return kept
        kept.extend(buf[stretch_start:])
        return kept