######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, see
# <https://www.gnu.org/licenses/>.
######################### END LICENSE BLOCK #########################
import re
from typing import Union
from .charsetgroupprober import CharSetGroupProber
from .enums import EncodingEra, LanguageFilter, ProbingState
from .hebrewprober import HebrewProber
from .langarabicmodel import (
CP720_ARABIC_MODEL,
CP864_ARABIC_MODEL,
ISO_8859_6_ARABIC_MODEL,
WINDOWS_1256_ARABIC_MODEL,
)
from .langbelarusianmodel import (
CP866_BELARUSIAN_MODEL,
ISO_8859_5_BELARUSIAN_MODEL,
MACCYRILLIC_BELARUSIAN_MODEL,
WINDOWS_1251_BELARUSIAN_MODEL,
)
from .langbretonmodel import (
CP037_BRETON_MODEL,
CP500_BRETON_MODEL,
ISO_8859_14_BRETON_MODEL,
)
from .langbulgarianmodel import (
CP855_BULGARIAN_MODEL,
ISO_8859_5_BULGARIAN_MODEL,
MACCYRILLIC_BULGARIAN_MODEL,
WINDOWS_1251_BULGARIAN_MODEL,
)
from .langcroatianmodel import (
CP852_CROATIAN_MODEL,
ISO_8859_2_CROATIAN_MODEL,
ISO_8859_16_CROATIAN_MODEL,
MACLATIN2_CROATIAN_MODEL,
WINDOWS_1250_CROATIAN_MODEL,
)
from .langczechmodel import (
ISO_8859_2_CZECH_MODEL,
WINDOWS_1250_CZECH_MODEL,
)
from .langdanishmodel import (
CP037_DANISH_MODEL,
CP500_DANISH_MODEL,
CP850_DANISH_MODEL,
CP858_DANISH_MODEL,
CP865_DANISH_MODEL,
ISO_8859_1_DANISH_MODEL,
ISO_8859_15_DANISH_MODEL,
MACROMAN_DANISH_MODEL,
WINDOWS_1252_DANISH_MODEL,
)
from .langdutchmodel import (
CP037_DUTCH_MODEL,
CP500_DUTCH_MODEL,
CP850_DUTCH_MODEL,
CP858_DUTCH_MODEL,
ISO_8859_1_DUTCH_MODEL,
ISO_8859_15_DUTCH_MODEL,
MACROMAN_DUTCH_MODEL,
WINDOWS_1252_DUTCH_MODEL,
)
from .langenglishmodel import (
CP037_ENGLISH_MODEL,
CP437_ENGLISH_MODEL,
CP500_ENGLISH_MODEL,
CP850_ENGLISH_MODEL,
CP858_ENGLISH_MODEL,
ISO_8859_1_ENGLISH_MODEL,
ISO_8859_15_ENGLISH_MODEL,
MACROMAN_ENGLISH_MODEL,
WINDOWS_1252_ENGLISH_MODEL,
)
from .langesperantomodel import ISO_8859_3_ESPERANTO_MODEL
from .langestonianmodel import (
CP775_ESTONIAN_MODEL,
ISO_8859_4_ESTONIAN_MODEL,
ISO_8859_13_ESTONIAN_MODEL,
WINDOWS_1257_ESTONIAN_MODEL,
)
from .langfarsimodel import (
ISO_8859_6_FARSI_MODEL,
WINDOWS_1256_FARSI_MODEL,
)
from .langfinnishmodel import (
CP037_FINNISH_MODEL,
CP500_FINNISH_MODEL,
CP850_FINNISH_MODEL,
CP858_FINNISH_MODEL,
ISO_8859_1_FINNISH_MODEL,
ISO_8859_15_FINNISH_MODEL,
MACROMAN_FINNISH_MODEL,
WINDOWS_1252_FINNISH_MODEL,
)
from .langfrenchmodel import (
CP037_FRENCH_MODEL,
CP500_FRENCH_MODEL,
CP850_FRENCH_MODEL,
CP858_FRENCH_MODEL,
CP863_FRENCH_MODEL,
ISO_8859_1_FRENCH_MODEL,
ISO_8859_15_FRENCH_MODEL,
MACROMAN_FRENCH_MODEL,
WINDOWS_1252_FRENCH_MODEL,
)
from .langgermanmodel import (
CP037_GERMAN_MODEL,
CP500_GERMAN_MODEL,
CP850_GERMAN_MODEL,
CP858_GERMAN_MODEL,
ISO_8859_1_GERMAN_MODEL,
ISO_8859_15_GERMAN_MODEL,
MACROMAN_GERMAN_MODEL,
WINDOWS_1252_GERMAN_MODEL,
)
from .langgreekmodel import (
CP737_GREEK_MODEL,
CP869_GREEK_MODEL,
CP875_GREEK_MODEL,
ISO_8859_7_GREEK_MODEL,
MACGREEK_GREEK_MODEL,
WINDOWS_1253_GREEK_MODEL,
)
from .langhebrewmodel import (
CP424_HEBREW_MODEL,
CP856_HEBREW_MODEL,
CP862_HEBREW_MODEL,
ISO_8859_8_HEBREW_MODEL,
WINDOWS_1255_HEBREW_MODEL,
)
from .langhungarianmodel import (
CP852_HUNGARIAN_MODEL,
ISO_8859_2_HUNGARIAN_MODEL,
ISO_8859_16_HUNGARIAN_MODEL,
MACLATIN2_HUNGARIAN_MODEL,
WINDOWS_1250_HUNGARIAN_MODEL,
)
from .langicelandicmodel import (
CP037_ICELANDIC_MODEL,
CP500_ICELANDIC_MODEL,
CP861_ICELANDIC_MODEL,
ISO_8859_1_ICELANDIC_MODEL,
ISO_8859_10_ICELANDIC_MODEL,
MACICELAND_ICELANDIC_MODEL,
)
from .langindonesianmodel import (
CP037_INDONESIAN_MODEL,
CP500_INDONESIAN_MODEL,
ISO_8859_1_INDONESIAN_MODEL,
MACROMAN_INDONESIAN_MODEL,
WINDOWS_1252_INDONESIAN_MODEL,
)
from .langirishmodel import (
CP037_IRISH_MODEL,
CP500_IRISH_MODEL,
ISO_8859_14_IRISH_MODEL,
)
from .langitalianmodel import (
CP037_ITALIAN_MODEL,
CP500_ITALIAN_MODEL,
CP850_ITALIAN_MODEL,
CP858_ITALIAN_MODEL,
ISO_8859_1_ITALIAN_MODEL,
ISO_8859_15_ITALIAN_MODEL,
MACROMAN_ITALIAN_MODEL,
WINDOWS_1252_ITALIAN_MODEL,
)
from .langkazakhmodel import (
KZ1048_KAZAKH_MODEL,
PTCP154_KAZAKH_MODEL,
)
from .langlatvianmodel import (
CP775_LATVIAN_MODEL,
ISO_8859_4_LATVIAN_MODEL,
ISO_8859_13_LATVIAN_MODEL,
WINDOWS_1257_LATVIAN_MODEL,
)
from .langlithuanianmodel import (
CP775_LITHUANIAN_MODEL,
ISO_8859_4_LITHUANIAN_MODEL,
ISO_8859_13_LITHUANIAN_MODEL,
WINDOWS_1257_LITHUANIAN_MODEL,
)
from .langmacedonianmodel import (
CP855_MACEDONIAN_MODEL,
ISO_8859_5_MACEDONIAN_MODEL,
MACCYRILLIC_MACEDONIAN_MODEL,
WINDOWS_1251_MACEDONIAN_MODEL,
)
from .langmalaymodel import (
CP037_MALAY_MODEL,
CP500_MALAY_MODEL,
ISO_8859_1_MALAY_MODEL,
MACROMAN_MALAY_MODEL,
WINDOWS_1252_MALAY_MODEL,
)
from .langmaltesemodel import ISO_8859_3_MALTESE_MODEL
from .langnorwegianmodel import (
CP037_NORWEGIAN_MODEL,
CP500_NORWEGIAN_MODEL,
CP850_NORWEGIAN_MODEL,
CP858_NORWEGIAN_MODEL,
CP865_NORWEGIAN_MODEL,
ISO_8859_1_NORWEGIAN_MODEL,
ISO_8859_15_NORWEGIAN_MODEL,
MACROMAN_NORWEGIAN_MODEL,
WINDOWS_1252_NORWEGIAN_MODEL,
)
from .langpolishmodel import (
CP852_POLISH_MODEL,
ISO_8859_2_POLISH_MODEL,
ISO_8859_16_POLISH_MODEL,
MACLATIN2_POLISH_MODEL,
WINDOWS_1250_POLISH_MODEL,
)
from .langportuguesemodel import (
CP037_PORTUGUESE_MODEL,
CP500_PORTUGUESE_MODEL,
CP850_PORTUGUESE_MODEL,
CP858_PORTUGUESE_MODEL,
CP860_PORTUGUESE_MODEL,
ISO_8859_1_PORTUGUESE_MODEL,
ISO_8859_15_PORTUGUESE_MODEL,
MACROMAN_PORTUGUESE_MODEL,
WINDOWS_1252_PORTUGUESE_MODEL,
)
from .langromanianmodel import (
CP852_ROMANIAN_MODEL,
ISO_8859_2_ROMANIAN_MODEL,
ISO_8859_16_ROMANIAN_MODEL,
MACLATIN2_ROMANIAN_MODEL,
WINDOWS_1250_ROMANIAN_MODEL,
)
from .langrussianmodel import (
CP855_RUSSIAN_MODEL,
CP866_RUSSIAN_MODEL,
ISO_8859_5_RUSSIAN_MODEL,
KOI8_R_RUSSIAN_MODEL,
MACCYRILLIC_RUSSIAN_MODEL,
WINDOWS_1251_RUSSIAN_MODEL,
)
from .langscottishgaelicmodel import (
CP037_SCOTTISH_GAELIC_MODEL,
CP500_SCOTTISH_GAELIC_MODEL,
ISO_8859_14_SCOTTISH_GAELIC_MODEL,
)
from .langserbianmodel import (
CP855_SERBIAN_MODEL,
ISO_8859_5_SERBIAN_MODEL,
MACCYRILLIC_SERBIAN_MODEL,
WINDOWS_1251_SERBIAN_MODEL,
)
from .langslovakmodel import (
CP852_SLOVAK_MODEL,
ISO_8859_2_SLOVAK_MODEL,
ISO_8859_16_SLOVAK_MODEL,
MACLATIN2_SLOVAK_MODEL,
WINDOWS_1250_SLOVAK_MODEL,
)
from .langslovenemodel import (
CP852_SLOVENE_MODEL,
ISO_8859_2_SLOVENE_MODEL,
ISO_8859_16_SLOVENE_MODEL,
MACLATIN2_SLOVENE_MODEL,
WINDOWS_1250_SLOVENE_MODEL,
)
from .langspanishmodel import (
CP037_SPANISH_MODEL,
CP500_SPANISH_MODEL,
CP850_SPANISH_MODEL,
CP858_SPANISH_MODEL,
ISO_8859_1_SPANISH_MODEL,
ISO_8859_15_SPANISH_MODEL,
MACROMAN_SPANISH_MODEL,
WINDOWS_1252_SPANISH_MODEL,
)
from .langswedishmodel import (
CP037_SWEDISH_MODEL,
CP500_SWEDISH_MODEL,
CP850_SWEDISH_MODEL,
CP858_SWEDISH_MODEL,
ISO_8859_1_SWEDISH_MODEL,
ISO_8859_15_SWEDISH_MODEL,
MACROMAN_SWEDISH_MODEL,
WINDOWS_1252_SWEDISH_MODEL,
)
from .langtajikmodel import KOI8_T_TAJIK_MODEL
from .langthaimodel import (
CP874_THAI_MODEL,
ISO_8859_11_THAI_MODEL,
TIS_620_THAI_MODEL,
)
from .langturkishmodel import (
CP857_TURKISH_MODEL,
CP1026_TURKISH_MODEL,
ISO_8859_3_TURKISH_MODEL,
ISO_8859_9_TURKISH_MODEL,
MACTURKISH_TURKISH_MODEL,
WINDOWS_1254_TURKISH_MODEL,
)
from .langukrainianmodel import (
CP1125_UKRAINIAN_MODEL,
ISO_8859_5_UKRAINIAN_MODEL,
KOI8_U_UKRAINIAN_MODEL,
MACCYRILLIC_UKRAINIAN_MODEL,
WINDOWS_1251_UKRAINIAN_MODEL,
)
from .langvietnamesemodel import WINDOWS_1258_VIETNAMESE_MODEL
from .langwelshmodel import (
CP037_WELSH_MODEL,
CP500_WELSH_MODEL,
ISO_8859_14_WELSH_MODEL,
)
from .sbcharsetprober import SingleByteCharSetProber
# Byte-pattern detectors used to tell single-byte encoding families apart.
#
# Bytes in the 0x80-0x9F range mean different things in each family:
#   * Windows code pages: printable punctuation (smart quotes, dashes,
#     currency symbols)
#   * Mac encodings:      printable letters (accented letters, diacriticals)
#   * ISO-8859-x:         C1 control codes (mostly unprintable)

# Matches any byte in the 0x80-0x9F range.
WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9f]")

# Matches bytes that are letters only in the Mac Latin encodings
# (control codes or punctuation in the Windows/ISO counterparts).
MAC_LATIN_ONLY_LETTER_DETECTOR = re.compile(b"[\x81\x8d\x8f\x90\x92\x9d]")

# Matches bytes that are Cyrillic letters in MacCyrillic but punctuation in
# Windows-1251.
MAC_CYRILLIC_ONLY_LETTER_DETECTOR = re.compile(
    b"[\x82\x84\x85\x86\x87\x88\x89\x8b\x91\x92\x93\x94\x95\x96\x97\x99\x9b]"
)

# Matches a 0x80-0x9F byte sandwiched between two ASCII letters, i.e. a byte
# used as a letter inside a word (suggests a Mac encoding).
MAC_LETTER_IN_WORD_DETECTOR = re.compile(b"[a-zA-Z][\x80-\x9f][a-zA-Z]")

# Matches 0xA4: the Euro sign in ISO-8859-15, the generic currency sign in
# ISO-8859-1.
EURO_SIGN_DETECTOR = re.compile(b"\xa4")

# "Latin" family of mutually confusable encodings (Mac = letters,
# Windows = punctuation in 0x80-0x9F). One line per Mac/Windows/ISO group.
CONFUSED_LATIN_ENCODINGS = frozenset(
    (
        "macroman", "windows-1252", "iso-8859-1", "iso-8859-15",
        "macgreek", "windows-1253", "iso-8859-7",
        "macturkish", "windows-1254", "iso-8859-9",
        # Turkish/Maltese/Esperanto (also works with MacTurkish/Win-1254)
        "iso-8859-3",
        "maciceland", "iso-8859-10",
        "iso-8859-14",
    )
)

# Central European family (Mac = letters, Windows = punctuation in 0x80-0x9F).
CONFUSED_CENTRAL_EUROPEAN_ENCODINGS = frozenset(
    (
        "maclatin2", "windows-1250", "iso-8859-2",
        # Southeast European/Romanian (close to Latin-2)
        "iso-8859-16",
    )
)

# Cyrillic family (Mac = letters, Windows = punctuation in 0x80-0x9F).
CONFUSED_CYRILLIC_ENCODINGS = frozenset(
    ("maccyrillic", "windows-1251", "iso-8859-5")
)

# Lower-cased ISO charset name -> display name of its Windows equivalent.
ISO_WIN_MAP = {
    "iso-8859-1": "Windows-1252",
    "iso-8859-2": "Windows-1250",
    "iso-8859-5": "Windows-1251",
    "iso-8859-6": "Windows-1256",
    "iso-8859-7": "Windows-1253",
    "iso-8859-8": "Windows-1255",
    "iso-8859-9": "Windows-1254",
    "iso-8859-13": "Windows-1257",
}
class SBCSGroupProber(CharSetGroupProber):
    """Group prober that runs one prober per single-byte charset/language model.

    On top of the parent group prober's result, this class records a few
    byte-level observations while data is fed (bytes in 0x80-0x9F, Mac-only
    letter bytes, the 0xA4 byte) and uses them in ``get_confidence`` to choose
    between Mac, Windows and ISO-8859 encodings whose confidences are close.
    """

    def __init__(
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        encoding_era: EncodingEra = EncodingEra.MODERN_WEB,
    ) -> None:
        """Build the full prober list, filter it, and reset all state.

        Args:
            lang_filter: Forwarded to the parent constructor.
            encoding_era: Forwarded to the parent constructor.

        The assembled list is passed through ``self._filter_probers`` (not
        defined in this file; per the inline comment it filters on encoding
        era and language).
        """
        super().__init__(lang_filter=lang_filter, encoding_era=encoding_era)
        # Byte-pattern flags consulted by the disambiguation heuristics in
        # get_confidence(); they are (re)initialised by reset() as well.
        self._has_win_bytes = False
        self._has_mac_latin_letter_pattern = False
        self._has_mac_cyrillic_letter_pattern = False
        self._has_euro_sign = False

        # The logical and visual Hebrew probers share one HebrewProber as
        # their name prober, and that HebrewProber is told about both of them.
        hebrew_prober = HebrewProber()
        logical_hebrew_prober = SingleByteCharSetProber(
            WINDOWS_1255_HEBREW_MODEL, is_reversed=False, name_prober=hebrew_prober
        )
        visual_hebrew_prober = SingleByteCharSetProber(
            ISO_8859_8_HEBREW_MODEL, is_reversed=True, name_prober=hebrew_prober
        )
        hebrew_prober.set_model_probers(logical_hebrew_prober, visual_hebrew_prober)

        # TODO: ORDER MATTERS HERE. I changed the order vs what was in master
        #       and several tests failed that did not before. Some thought
        #       should be put into the ordering, and we should consider making
        #       order not matter here, because that is very counter-intuitive.
        self.probers = [
            SingleByteCharSetProber(CP720_ARABIC_MODEL),
            SingleByteCharSetProber(CP864_ARABIC_MODEL),
            SingleByteCharSetProber(ISO_8859_6_ARABIC_MODEL),
            SingleByteCharSetProber(WINDOWS_1256_ARABIC_MODEL),
            SingleByteCharSetProber(CP866_BELARUSIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_5_BELARUSIAN_MODEL),
            SingleByteCharSetProber(MACCYRILLIC_BELARUSIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1251_BELARUSIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_14_BRETON_MODEL),
            SingleByteCharSetProber(CP037_BRETON_MODEL),
            SingleByteCharSetProber(CP500_BRETON_MODEL),
            SingleByteCharSetProber(CP855_BULGARIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_5_BULGARIAN_MODEL),
            SingleByteCharSetProber(MACCYRILLIC_BULGARIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1251_BULGARIAN_MODEL),
            SingleByteCharSetProber(CP852_CROATIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_16_CROATIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_2_CROATIAN_MODEL),
            SingleByteCharSetProber(MACLATIN2_CROATIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1250_CROATIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_2_CZECH_MODEL),
            SingleByteCharSetProber(WINDOWS_1250_CZECH_MODEL),
            SingleByteCharSetProber(CP037_DANISH_MODEL),
            SingleByteCharSetProber(CP500_DANISH_MODEL),
            SingleByteCharSetProber(CP850_DANISH_MODEL),
            SingleByteCharSetProber(CP858_DANISH_MODEL),
            SingleByteCharSetProber(CP865_DANISH_MODEL),
            SingleByteCharSetProber(ISO_8859_15_DANISH_MODEL),
            SingleByteCharSetProber(ISO_8859_1_DANISH_MODEL),
            SingleByteCharSetProber(MACROMAN_DANISH_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_DANISH_MODEL),
            SingleByteCharSetProber(CP037_DUTCH_MODEL),
            SingleByteCharSetProber(CP500_DUTCH_MODEL),
            SingleByteCharSetProber(CP850_DUTCH_MODEL),
            SingleByteCharSetProber(CP858_DUTCH_MODEL),
            SingleByteCharSetProber(ISO_8859_15_DUTCH_MODEL),
            SingleByteCharSetProber(ISO_8859_1_DUTCH_MODEL),
            SingleByteCharSetProber(MACROMAN_DUTCH_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_DUTCH_MODEL),
            SingleByteCharSetProber(CP037_ENGLISH_MODEL),
            SingleByteCharSetProber(CP437_ENGLISH_MODEL),
            SingleByteCharSetProber(CP500_ENGLISH_MODEL),
            SingleByteCharSetProber(CP850_ENGLISH_MODEL),
            SingleByteCharSetProber(CP858_ENGLISH_MODEL),
            SingleByteCharSetProber(ISO_8859_15_ENGLISH_MODEL),
            SingleByteCharSetProber(ISO_8859_1_ENGLISH_MODEL),
            SingleByteCharSetProber(MACROMAN_ENGLISH_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_ENGLISH_MODEL),
            SingleByteCharSetProber(ISO_8859_3_ESPERANTO_MODEL),
            SingleByteCharSetProber(CP775_ESTONIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_13_ESTONIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_4_ESTONIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1257_ESTONIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_6_FARSI_MODEL),
            SingleByteCharSetProber(WINDOWS_1256_FARSI_MODEL),
            SingleByteCharSetProber(CP037_FINNISH_MODEL),
            SingleByteCharSetProber(CP500_FINNISH_MODEL),
            SingleByteCharSetProber(CP850_FINNISH_MODEL),
            SingleByteCharSetProber(CP858_FINNISH_MODEL),
            SingleByteCharSetProber(ISO_8859_15_FINNISH_MODEL),
            SingleByteCharSetProber(ISO_8859_1_FINNISH_MODEL),
            SingleByteCharSetProber(MACROMAN_FINNISH_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_FINNISH_MODEL),
            SingleByteCharSetProber(CP037_FRENCH_MODEL),
            SingleByteCharSetProber(CP500_FRENCH_MODEL),
            SingleByteCharSetProber(CP850_FRENCH_MODEL),
            SingleByteCharSetProber(CP858_FRENCH_MODEL),
            SingleByteCharSetProber(CP863_FRENCH_MODEL),
            SingleByteCharSetProber(ISO_8859_15_FRENCH_MODEL),
            SingleByteCharSetProber(ISO_8859_1_FRENCH_MODEL),
            SingleByteCharSetProber(MACROMAN_FRENCH_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_FRENCH_MODEL),
            SingleByteCharSetProber(CP037_GERMAN_MODEL),
            SingleByteCharSetProber(CP500_GERMAN_MODEL),
            SingleByteCharSetProber(CP850_GERMAN_MODEL),
            SingleByteCharSetProber(CP858_GERMAN_MODEL),
            SingleByteCharSetProber(ISO_8859_15_GERMAN_MODEL),
            SingleByteCharSetProber(ISO_8859_1_GERMAN_MODEL),
            SingleByteCharSetProber(MACROMAN_GERMAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_GERMAN_MODEL),
            SingleByteCharSetProber(CP737_GREEK_MODEL),
            SingleByteCharSetProber(CP869_GREEK_MODEL),
            SingleByteCharSetProber(CP875_GREEK_MODEL),
            SingleByteCharSetProber(ISO_8859_7_GREEK_MODEL),
            SingleByteCharSetProber(MACGREEK_GREEK_MODEL),
            SingleByteCharSetProber(WINDOWS_1253_GREEK_MODEL),
            SingleByteCharSetProber(CP424_HEBREW_MODEL, is_reversed=True),
            SingleByteCharSetProber(CP856_HEBREW_MODEL, is_reversed=True),
            SingleByteCharSetProber(CP862_HEBREW_MODEL, is_reversed=True),
            hebrew_prober,
            logical_hebrew_prober,
            visual_hebrew_prober,
            SingleByteCharSetProber(CP852_HUNGARIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_16_HUNGARIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_2_HUNGARIAN_MODEL),
            SingleByteCharSetProber(MACLATIN2_HUNGARIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1250_HUNGARIAN_MODEL),
            SingleByteCharSetProber(CP037_ICELANDIC_MODEL),
            SingleByteCharSetProber(CP500_ICELANDIC_MODEL),
            SingleByteCharSetProber(CP861_ICELANDIC_MODEL),
            SingleByteCharSetProber(ISO_8859_10_ICELANDIC_MODEL),
            SingleByteCharSetProber(ISO_8859_1_ICELANDIC_MODEL),
            SingleByteCharSetProber(MACICELAND_ICELANDIC_MODEL),
            SingleByteCharSetProber(CP037_INDONESIAN_MODEL),
            SingleByteCharSetProber(CP500_INDONESIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_1_INDONESIAN_MODEL),
            SingleByteCharSetProber(MACROMAN_INDONESIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_INDONESIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_14_IRISH_MODEL),
            SingleByteCharSetProber(CP037_IRISH_MODEL),
            SingleByteCharSetProber(CP500_IRISH_MODEL),
            SingleByteCharSetProber(CP037_ITALIAN_MODEL),
            SingleByteCharSetProber(CP500_ITALIAN_MODEL),
            SingleByteCharSetProber(CP850_ITALIAN_MODEL),
            SingleByteCharSetProber(CP858_ITALIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_15_ITALIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_1_ITALIAN_MODEL),
            SingleByteCharSetProber(MACROMAN_ITALIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_ITALIAN_MODEL),
            SingleByteCharSetProber(KZ1048_KAZAKH_MODEL),
            SingleByteCharSetProber(PTCP154_KAZAKH_MODEL),
            SingleByteCharSetProber(CP775_LATVIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_13_LATVIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_4_LATVIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1257_LATVIAN_MODEL),
            SingleByteCharSetProber(CP775_LITHUANIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_13_LITHUANIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_4_LITHUANIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1257_LITHUANIAN_MODEL),
            SingleByteCharSetProber(CP855_MACEDONIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_5_MACEDONIAN_MODEL),
            SingleByteCharSetProber(MACCYRILLIC_MACEDONIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1251_MACEDONIAN_MODEL),
            SingleByteCharSetProber(CP037_MALAY_MODEL),
            SingleByteCharSetProber(CP500_MALAY_MODEL),
            SingleByteCharSetProber(ISO_8859_1_MALAY_MODEL),
            SingleByteCharSetProber(MACROMAN_MALAY_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_MALAY_MODEL),
            SingleByteCharSetProber(ISO_8859_3_MALTESE_MODEL),
            SingleByteCharSetProber(CP037_NORWEGIAN_MODEL),
            SingleByteCharSetProber(CP500_NORWEGIAN_MODEL),
            SingleByteCharSetProber(CP850_NORWEGIAN_MODEL),
            SingleByteCharSetProber(CP858_NORWEGIAN_MODEL),
            SingleByteCharSetProber(CP865_NORWEGIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_15_NORWEGIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_1_NORWEGIAN_MODEL),
            SingleByteCharSetProber(MACROMAN_NORWEGIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_NORWEGIAN_MODEL),
            SingleByteCharSetProber(CP852_POLISH_MODEL),
            SingleByteCharSetProber(ISO_8859_16_POLISH_MODEL),
            SingleByteCharSetProber(ISO_8859_2_POLISH_MODEL),
            SingleByteCharSetProber(MACLATIN2_POLISH_MODEL),
            SingleByteCharSetProber(WINDOWS_1250_POLISH_MODEL),
            SingleByteCharSetProber(CP037_PORTUGUESE_MODEL),
            SingleByteCharSetProber(CP500_PORTUGUESE_MODEL),
            SingleByteCharSetProber(CP850_PORTUGUESE_MODEL),
            SingleByteCharSetProber(CP858_PORTUGUESE_MODEL),
            SingleByteCharSetProber(CP860_PORTUGUESE_MODEL),
            SingleByteCharSetProber(ISO_8859_15_PORTUGUESE_MODEL),
            SingleByteCharSetProber(ISO_8859_1_PORTUGUESE_MODEL),
            SingleByteCharSetProber(MACROMAN_PORTUGUESE_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_PORTUGUESE_MODEL),
            SingleByteCharSetProber(CP852_ROMANIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_16_ROMANIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_2_ROMANIAN_MODEL),
            SingleByteCharSetProber(MACLATIN2_ROMANIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1250_ROMANIAN_MODEL),
            SingleByteCharSetProber(CP855_RUSSIAN_MODEL),
            SingleByteCharSetProber(CP866_RUSSIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_5_RUSSIAN_MODEL),
            SingleByteCharSetProber(KOI8_R_RUSSIAN_MODEL),
            SingleByteCharSetProber(MACCYRILLIC_RUSSIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1251_RUSSIAN_MODEL),
            SingleByteCharSetProber(CP855_SERBIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_5_SERBIAN_MODEL),
            SingleByteCharSetProber(MACCYRILLIC_SERBIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1251_SERBIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_14_SCOTTISH_GAELIC_MODEL),
            SingleByteCharSetProber(CP037_SCOTTISH_GAELIC_MODEL),
            SingleByteCharSetProber(CP500_SCOTTISH_GAELIC_MODEL),
            SingleByteCharSetProber(CP852_SLOVAK_MODEL),
            SingleByteCharSetProber(ISO_8859_16_SLOVAK_MODEL),
            SingleByteCharSetProber(ISO_8859_2_SLOVAK_MODEL),
            SingleByteCharSetProber(MACLATIN2_SLOVAK_MODEL),
            SingleByteCharSetProber(WINDOWS_1250_SLOVAK_MODEL),
            SingleByteCharSetProber(CP852_SLOVENE_MODEL),
            SingleByteCharSetProber(ISO_8859_16_SLOVENE_MODEL),
            SingleByteCharSetProber(ISO_8859_2_SLOVENE_MODEL),
            SingleByteCharSetProber(MACLATIN2_SLOVENE_MODEL),
            SingleByteCharSetProber(WINDOWS_1250_SLOVENE_MODEL),
            SingleByteCharSetProber(CP037_SPANISH_MODEL),
            SingleByteCharSetProber(CP500_SPANISH_MODEL),
            SingleByteCharSetProber(CP850_SPANISH_MODEL),
            SingleByteCharSetProber(CP858_SPANISH_MODEL),
            SingleByteCharSetProber(ISO_8859_15_SPANISH_MODEL),
            SingleByteCharSetProber(ISO_8859_1_SPANISH_MODEL),
            SingleByteCharSetProber(MACROMAN_SPANISH_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_SPANISH_MODEL),
            SingleByteCharSetProber(CP037_SWEDISH_MODEL),
            SingleByteCharSetProber(CP500_SWEDISH_MODEL),
            SingleByteCharSetProber(CP850_SWEDISH_MODEL),
            SingleByteCharSetProber(CP858_SWEDISH_MODEL),
            SingleByteCharSetProber(ISO_8859_15_SWEDISH_MODEL),
            SingleByteCharSetProber(ISO_8859_1_SWEDISH_MODEL),
            SingleByteCharSetProber(MACROMAN_SWEDISH_MODEL),
            SingleByteCharSetProber(WINDOWS_1252_SWEDISH_MODEL),
            SingleByteCharSetProber(KOI8_T_TAJIK_MODEL),
            SingleByteCharSetProber(CP874_THAI_MODEL),
            SingleByteCharSetProber(ISO_8859_11_THAI_MODEL),
            SingleByteCharSetProber(TIS_620_THAI_MODEL),
            SingleByteCharSetProber(CP1026_TURKISH_MODEL),
            SingleByteCharSetProber(CP857_TURKISH_MODEL),
            SingleByteCharSetProber(ISO_8859_3_TURKISH_MODEL),
            SingleByteCharSetProber(ISO_8859_9_TURKISH_MODEL),
            SingleByteCharSetProber(MACTURKISH_TURKISH_MODEL),
            SingleByteCharSetProber(WINDOWS_1254_TURKISH_MODEL),
            SingleByteCharSetProber(CP1125_UKRAINIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_5_UKRAINIAN_MODEL),
            SingleByteCharSetProber(KOI8_U_UKRAINIAN_MODEL),
            SingleByteCharSetProber(MACCYRILLIC_UKRAINIAN_MODEL),
            SingleByteCharSetProber(WINDOWS_1251_UKRAINIAN_MODEL),
            SingleByteCharSetProber(ISO_8859_14_WELSH_MODEL),
            SingleByteCharSetProber(CP037_WELSH_MODEL),
            SingleByteCharSetProber(CP500_WELSH_MODEL),
            SingleByteCharSetProber(WINDOWS_1258_VIETNAMESE_MODEL),
        ]
        # Filter probers based on encoding era and language
        self.probers = self._filter_probers(self.probers)
        self.reset()

    def reset(self) -> None:
        """Reset the parent state and clear every byte-pattern flag."""
        super().reset()
        self._has_win_bytes = False
        self._has_mac_latin_letter_pattern = False
        self._has_mac_cyrillic_letter_pattern = False
        self._has_euro_sign = False

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Record byte-pattern flags for ``byte_str``, then feed the probers.

        Args:
            byte_str: The next chunk of input.

        Returns:
            Whatever the parent ``feed`` returns for this chunk.

        Only the new chunk is scanned, so a three-byte
        ``MAC_LETTER_IN_WORD_DETECTOR`` match that straddles two calls is not
        seen. Flags are only ever set here (``reset`` clears them), so a
        detector whose flag is already set is not run again.
        """
        if not self._has_win_bytes and WIN_BYTE_DETECTOR.search(byte_str):
            self._has_win_bytes = True
        if not self._has_mac_latin_letter_pattern and (
            MAC_LETTER_IN_WORD_DETECTOR.search(byte_str)
            or MAC_LATIN_ONLY_LETTER_DETECTOR.search(byte_str)
        ):
            self._has_mac_latin_letter_pattern = True
        if (
            not self._has_mac_cyrillic_letter_pattern
            and MAC_CYRILLIC_ONLY_LETTER_DETECTOR.search(byte_str)
        ):
            self._has_mac_cyrillic_letter_pattern = True
        if not self._has_euro_sign and EURO_SIGN_DETECTOR.search(byte_str):
            self._has_euro_sign = True
        # Call parent feed method
        return super().feed(byte_str)

    def get_confidence(self) -> float:
        """Return the group confidence after Mac/Windows/ISO disambiguation.

        Starts from the parent's confidence and current best-guess prober.
        If one of the heuristics below prefers another active prober,
        ``self._best_guess_prober`` is REPLACED with that prober (a side
        effect of this getter) and that prober's confidence is returned.
        Otherwise the parent's confidence is returned unchanged.

        Heuristics are tried in order and the first one that fires wins.
        """
        # Get base confidence from parent
        base_confidence = super().get_confidence()

        # Nothing to disambiguate without a named best guess.
        if not self._best_guess_prober:
            return base_confidence
        charset_name = self._best_guess_prober.charset_name
        if not charset_name:
            return base_confidence

        confidence = base_confidence
        lower_charset_name = charset_name.lower()

        # Thresholds an alternative must reach to displace the winner.
        near_tie = confidence * 0.995  # within 0.5% of the winner
        euro_tie = confidence * 0.99  # within 1% (Euro-sign heuristics)
        mac_preferred = confidence * 0.90  # within 10% (Mac letter evidence)

        # Best (prober, confidence) per lower-cased charset name among the
        # active probers, excluding the current winner.
        alternatives = {}
        for prober in self.probers:
            if not prober.active or prober == self._best_guess_prober:
                continue
            alt_name = (prober.charset_name or "").lower()
            alt_conf = prober.get_confidence()
            if alt_name not in alternatives or alt_conf > alternatives[alt_name][1]:
                alternatives[alt_name] = (prober, alt_conf)

        # Heuristic 1: Mac/Windows/ISO disambiguation for LATIN encodings
        is_latin_family = lower_charset_name in CONFUSED_LATIN_ENCODINGS
        if (
            is_latin_family
            and lower_charset_name == "macroman"
            and not self._has_mac_latin_letter_pattern
        ):
            # MacRoman wins but no Mac patterns -> prefer ISO/Windows.
            # If we have Win bytes, prefer Windows encodings specifically.
            alt_names = (
                ("windows-1252", "iso-8859-1", "iso-8859-15")
                if self._has_win_bytes
                else ("iso-8859-1", "windows-1252", "iso-8859-15")
            )
            for alt_name in alt_names:
                if alt_name in alternatives:
                    prober, alt_conf = alternatives[alt_name]
                    if alt_conf >= near_tie:
                        self._best_guess_prober = prober
                        return alt_conf

        # Cross-family Mac vs Windows disambiguation.
        # If ANY Mac encoding wins but we have Windows bytes and no Mac
        # patterns, prefer any close Windows alternative (even from a
        # different language family). This handles cases where
        # MacRoman/MacLatin2/etc wins against text in a different family.
        if (
            lower_charset_name.startswith("mac")
            and self._has_win_bytes
            and not self._has_mac_latin_letter_pattern
        ):
            win_alternatives = [
                (prober, conf)
                for name, (prober, conf) in alternatives.items()
                if name.startswith("windows-")
            ]
            if win_alternatives:
                # Highest-confidence Windows alternative; on ties max() keeps
                # the first one, matching a stable descending sort.
                best_win_prober, best_win_conf = max(
                    win_alternatives, key=lambda item: item[1]
                )
                if best_win_conf >= near_tie:
                    self._best_guess_prober = best_win_prober
                    return best_win_conf
        elif lower_charset_name.startswith("iso-8859"):
            is_latin_iso = lower_charset_name in CONFUSED_LATIN_ENCODINGS
            # ISO wins and has Windows bytes -> switch to Windows
            if self._has_win_bytes:
                should_switch = True
                # But check if Mac is close with Mac patterns (Latin only)
                if is_latin_iso and self._has_mac_latin_letter_pattern:
                    for mac_name in alternatives:
                        if (
                            mac_name.startswith("mac")
                            and mac_name in CONFUSED_LATIN_ENCODINGS
                        ):
                            _, mac_conf = alternatives[mac_name]
                            if mac_conf >= near_tie:
                                should_switch = False
                                break
                if should_switch:
                    win_name = ISO_WIN_MAP.get(lower_charset_name)
                    if win_name and win_name.lower() in alternatives:
                        # No closeness threshold here: the mapped Windows
                        # prober replaces the ISO winner unconditionally.
                        prober, alt_conf = alternatives[win_name.lower()]
                        self._best_guess_prober = prober
                        return alt_conf

            # ISO-8859-1 with Euro sign -> prefer ISO-8859-15
            if lower_charset_name == "iso-8859-1" and self._has_euro_sign:
                if "iso-8859-15" in alternatives:
                    prober, alt_conf = alternatives["iso-8859-15"]
                    if alt_conf >= euro_tie:
                        self._best_guess_prober = prober
                        return alt_conf

        # Heuristic 2: Euro sign detection for Latin encodings
        if self._has_euro_sign and "iso-8859-15" in alternatives:
            is_latin_encoding = lower_charset_name in CONFUSED_LATIN_ENCODINGS
            if is_latin_encoding:
                prober, alt_conf = alternatives["iso-8859-15"]
                if alt_conf >= euro_tie:
                    self._best_guess_prober = prober
                    return alt_conf

        # Heuristic 3: Prefer Mac over Windows/ISO when Mac Latin letter
        # patterns are present
        if self._has_mac_latin_letter_pattern:
            mac_alternatives = [
                name
                for name in alternatives
                if name.startswith("mac") and name in CONFUSED_LATIN_ENCODINGS
            ]
            for mac_name in mac_alternatives:
                prober, mac_conf = alternatives[mac_name]
                is_latin_win_or_iso = (
                    lower_charset_name in CONFUSED_LATIN_ENCODINGS
                    and not lower_charset_name.startswith("mac")
                )
                if is_latin_win_or_iso and mac_conf >= mac_preferred:
                    self._best_guess_prober = prober
                    return mac_conf

        # Heuristic 4: Mac/Windows/ISO disambiguation for CYRILLIC encodings
        is_cyrillic_family = lower_charset_name in CONFUSED_CYRILLIC_ENCODINGS
        if is_cyrillic_family and lower_charset_name == "maccyrillic":
            # MacCyrillic wins but no Mac Cyrillic patterns -> prefer
            # Windows/ISO
            if not self._has_mac_cyrillic_letter_pattern and not self._has_win_bytes:
                for alt_name in ("windows-1251", "iso-8859-5"):
                    if alt_name in alternatives:
                        prober, alt_conf = alternatives[alt_name]
                        if alt_conf >= near_tie:
                            self._best_guess_prober = prober
                            return alt_conf
        elif is_cyrillic_family and lower_charset_name == "iso-8859-5":
            # ISO-8859-5 wins and has Windows bytes -> switch to Windows-1251
            # NOTE(review): with Windows bytes present, the generic ISO branch
            # of Heuristic 1 has already returned whenever a windows-1251
            # alternative exists (ISO_WIN_MAP maps iso-8859-5 to it, and the
            # Mac check there is Latin-only). The switch below therefore looks
            # unreachable and the MacCyrillic guard never takes effect.
            # Confirm intent before relying on it.
            if self._has_win_bytes:
                should_switch = True
                if (
                    self._has_mac_cyrillic_letter_pattern
                    and "maccyrillic" in alternatives
                ):
                    _, mac_conf = alternatives["maccyrillic"]
                    if mac_conf >= near_tie:
                        should_switch = False
                if should_switch and "windows-1251" in alternatives:
                    prober, alt_conf = alternatives["windows-1251"]
                    self._best_guess_prober = prober
                    return alt_conf

        # Heuristic 5: Prefer MacCyrillic when Mac Cyrillic letter patterns
        # are present
        if self._has_mac_cyrillic_letter_pattern and "maccyrillic" in alternatives:
            prober, mac_conf = alternatives["maccyrillic"]
            is_cyrillic_win_or_iso = (
                lower_charset_name in CONFUSED_CYRILLIC_ENCODINGS
                and lower_charset_name != "maccyrillic"
            )
            if is_cyrillic_win_or_iso and mac_conf >= mac_preferred:
                self._best_guess_prober = prober
                return mac_conf

        # Heuristic 6: Mac/Windows/ISO disambiguation for CENTRAL EUROPEAN
        # encodings
        is_central_european_family = (
            lower_charset_name in CONFUSED_CENTRAL_EUROPEAN_ENCODINGS
        )
        if is_central_european_family and lower_charset_name == "maclatin2":
            # MacLatin2 wins but no Mac patterns -> prefer Windows/ISO
            if not self._has_mac_latin_letter_pattern:
                alt_names = (
                    ("windows-1250", "iso-8859-2")
                    if self._has_win_bytes
                    else ("iso-8859-2", "windows-1250")
                )
                for alt_name in alt_names:
                    if alt_name in alternatives:
                        prober, alt_conf = alternatives[alt_name]
                        if alt_conf >= near_tie:
                            self._best_guess_prober = prober
                            return alt_conf
        elif is_central_european_family and lower_charset_name == "iso-8859-2":
            # ISO-8859-2 wins and has Windows bytes -> switch to Windows-1250
            # NOTE(review): same situation as the iso-8859-5 branch above.
            # Heuristic 1 has already switched to windows-1250 when that
            # alternative exists, so the MacLatin2 guard here appears to have
            # no effect. Confirm intent.
            if self._has_win_bytes:
                should_switch = True
                if self._has_mac_latin_letter_pattern and "maclatin2" in alternatives:
                    _, mac_conf = alternatives["maclatin2"]
                    if mac_conf >= near_tie:
                        should_switch = False
                if should_switch and "windows-1250" in alternatives:
                    prober, alt_conf = alternatives["windows-1250"]
                    self._best_guess_prober = prober
                    return alt_conf

        # Heuristic 7: Prefer MacLatin2 when Mac Latin letter patterns are
        # present
        if self._has_mac_latin_letter_pattern and "maclatin2" in alternatives:
            prober, mac_conf = alternatives["maclatin2"]
            is_central_european_win_or_iso = (
                lower_charset_name in CONFUSED_CENTRAL_EUROPEAN_ENCODINGS
                and lower_charset_name != "maclatin2"
            )
            if is_central_european_win_or_iso and mac_conf >= mac_preferred:
                self._best_guess_prober = prober
                return mac_conf

        return confidence