# Source code for chardet.metadata.languages

"""
Metadata about languages used by our model training code for our
SingleByteCharSetProbers.  Could be used for other things in the future.

This code was originally based on the language metadata from the uchardet
project.
"""

from dataclasses import dataclass
from string import ascii_letters


[docs] @dataclass(frozen=True) class Language: """Metadata about a language useful for training models :ivar name: The human name for the language, in English. :type name: str :ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, or use another catalog as a last resort. :type iso_code: str :ivar use_ascii: Whether or not ASCII letters should be included in trained models. :type use_ascii: bool :ivar charsets: The charsets we want to support and create data for. :type charsets: list of str :ivar alphabet: The characters in the language's alphabet. If `use_ascii` is `True`, you only need to add those not in the ASCII set. :type alphabet: str :ivar num_training_docs: Number of documents from CulturaX to use for training. This represents approximately 300M characters of training data. None means the count hasn't been determined yet. :type num_training_docs: int or None :ivar num_training_chars: Number of characters from CulturaX used for training. The goal is for this to be at least 300M characters, but some languages may not have that much data available. None means the count hasn't been determined yet. :type num_training_chars: int or None """ name: str iso_code: str use_ascii: bool charsets: list[str] alphabet: str num_training_docs: int | None = None num_training_chars: int | None = None def __repr__(self) -> str: param_str = ", ".join( f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_") ) return f"{self.__class__.__name__}({param_str})"
# Registry of the supported languages, keyed by the English language name
# (each key equals the ``name`` field of its ``Language`` value).
#
# For entries with ``use_ascii=True`` the alphabet is built as the sorted,
# de-duplicated union of the ASCII letters and the language-specific letters.
# Japanese, Korean and Chinese have an empty alphabet and no training counts.
LANGUAGES = {
    "Arabic": Language(
        name="Arabic",
        iso_code="ar",
        use_ascii=False,
        charsets=[
            "CP720", "CP864", "ISO-8859-6",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1256",
        ],
        alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
        num_training_docs=98_634,
        num_training_chars=300_001_734,
    ),
    "Belarusian": Language(
        name="Belarusian",
        iso_code="be",
        use_ascii=False,
        charsets=[
            "CP855", "CP866", "ISO-8859-5", "KOI8-R", "MacCyrillic",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1251",
        ],
        alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
        num_training_docs=77_015,
        num_training_chars=300_003_525,
    ),
    "Bulgarian": Language(
        name="Bulgarian",
        iso_code="bg",
        use_ascii=False,
        charsets=[
            "CP855", "CP866", "ISO-8859-5", "KOI8-R", "MacCyrillic",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1251",
        ],
        alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
        num_training_docs=84_186,
        num_training_chars=300_001_428,
    ),
    "Breton": Language(
        name="Breton",
        iso_code="br",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858",
            "ISO-8859-1", "ISO-8859-14", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ÀÂÈÊÎÔÙÛàâèêîôùû"))),
        num_training_docs=43_761,
        num_training_chars=39_597_819,
    ),
    "Welsh": Language(
        name="Welsh",
        iso_code="cy",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858",
            "ISO-8859-1", "ISO-8859-14", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        # ẀẂẄ never seem to occur in training data, so don't include them
        alphabet="".join(
            sorted(set(ascii_letters + "ÁÂÄÉÊËÍÎÏÓÔÖÚÛÜÝáâäéêëíîïóôöúûüýÿŴŵŶŷŸẁẃẅỲỳ"))
        ),
        num_training_docs=78_726,
        num_training_chars=300_014_737,
    ),
    "Czech": Language(
        name="Czech",
        iso_code="cs",
        use_ascii=True,
        charsets=[
            "CP852", "ISO-8859-2", "ISO-8859-16", "MacLatin2",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1250",
        ],
        alphabet="".join(sorted(set(ascii_letters + "áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ"))),
        num_training_docs=98_820,
        num_training_chars=300_005_076,
    ),
    "Danish": Language(
        name="Danish",
        iso_code="da",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858", "CP865",
            "ISO-8859-1", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "æøåÆØÅ"))),
        num_training_docs=83_466,
        num_training_chars=300_004_040,
    ),
    "German": Language(
        name="German",
        iso_code="de",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858",
            "ISO-8859-1", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "äöüßẞÄÖÜ"))),
        num_training_docs=83_754,
        num_training_chars=300_002_572,
    ),
    "Greek": Language(
        name="Greek",
        iso_code="el",
        use_ascii=False,
        charsets=[
            "CP737", "CP869", "CP875", "ISO-8859-7", "MacGreek",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1253",
        ],
        alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
        num_training_docs=103_810,
        num_training_chars=300_003_051,
    ),
    "English": Language(
        name="English",
        iso_code="en",
        alphabet=ascii_letters,
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858",
            "ISO-8859-1", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        num_training_docs=84_511,
        num_training_chars=300_004_447,
    ),
    "Esperanto": Language(
        name="Esperanto",
        iso_code="eo",
        use_ascii=True,
        charsets=[
            "ISO-8859-3",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
        ],
        # Include Q, W, X, Y for loanwords
        alphabet="".join(sorted(set(ascii_letters + "ĉĝĥĵŝŭĈĜĤĴŜŬ"))),
        num_training_docs=40_441,
        num_training_chars=300_001_893,
    ),
    "Spanish": Language(
        name="Spanish",
        iso_code="es",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858",
            "ISO-8859-1", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ñáéíóúüÑÁÉÍÓÚÜ"))),
        num_training_docs=87_069,
        num_training_chars=300_000_884,
    ),
    "Estonian": Language(
        name="Estonian",
        iso_code="et",
        use_ascii=True,
        charsets=[
            "CP775", "ISO-8859-4", "ISO-8859-13",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1257",
        ],
        # Include C, F, Š, Q, W, X, Y, Z, Ž for loanwords
        alphabet="".join(sorted(set(ascii_letters + "õäöüšžÕÄÖÜŠŽ"))),
        num_training_docs=66_818,
        num_training_chars=300_000_765,
    ),
    "Farsi": Language(
        name="Farsi",
        iso_code="fa",
        use_ascii=False,
        charsets=[
            "CP720", "CP864", "ISO-8859-6",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1256",
        ],
        alphabet="ءآأؤإئابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیًٌٍَُِّ",
        num_training_docs=104_835,
        num_training_chars=300_001_684,
    ),
    "Finnish": Language(
        name="Finnish",
        iso_code="fi",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858",
            "ISO-8859-1", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ÅÄÖŠŽåäöšž"))),
        num_training_docs=80_361,
        num_training_chars=300_001_375,
    ),
    "French": Language(
        name="French",
        iso_code="fr",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858", "CP863",
            "ISO-8859-1", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(
            sorted(set(ascii_letters + "àâæçèéêëîïôùûüÿÀÂÆÇÈÉÊËÎÏÔÙÛÜŸŒœ"))
        ),
        num_training_docs=88_369,
        num_training_chars=300_003_422,
    ),
    "Irish": Language(
        name="Irish",
        iso_code="ga",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858",
            "ISO-8859-1", "ISO-8859-14", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ÁÉÍÓÚáéíóú"))),
        num_training_docs=63_468,
        num_training_chars=300_005_169,
    ),
    "Scottish Gaelic": Language(
        name="Scottish Gaelic",
        iso_code="gd",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858",
            "ISO-8859-1", "ISO-8859-14", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ÀÈÌÒÙàèìòù"))),
        num_training_docs=8_408,
        num_training_chars=15_116_797,
    ),
    "Hebrew": Language(
        name="Hebrew",
        iso_code="he",
        use_ascii=False,
        charsets=[
            "CP424", "CP856", "CP862", "ISO-8859-8",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1255",
        ],
        alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
        num_training_docs=58_265,
        num_training_chars=300_009_414,
    ),
    "Croatian": Language(
        name="Croatian",
        iso_code="hr",
        use_ascii=True,
        charsets=[
            "CP852", "ISO-8859-2", "ISO-8859-16", "MacLatin2",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1250",
        ],
        # Include Q, W, X, Y for loanwords
        alphabet="".join(sorted(set(ascii_letters + "čćđšžČĆĐŠŽ"))),
        num_training_docs=460_689,
        num_training_chars=157_579_665,
    ),
    "Hungarian": Language(
        name="Hungarian",
        iso_code="hu",
        use_ascii=True,
        charsets=[
            "CP852", "ISO-8859-2", "ISO-8859-16", "MacLatin2",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1250",
        ],
        # Include Q, W, X, Y for loanwords
        alphabet="".join(sorted(set(ascii_letters + "áéíóöőúüűÁÉÍÓÖŐÚÜŰ"))),
        num_training_docs=82_417,
        num_training_chars=300_001_846,
    ),
    "Icelandic": Language(
        name="Icelandic",
        iso_code="is",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858", "CP861",
            "ISO-8859-1", "ISO-8859-10", "ISO-8859-15", "MacIceland", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        # Include Q, W for loanwords
        alphabet="".join(sorted(set(ascii_letters + "áðéíóúýþæöÁÐÉÍÓÚÝÞÆÖ"))),
        num_training_docs=77_487,
        num_training_chars=300_004_354,
    ),
    "Indonesian": Language(
        name="Indonesian",
        iso_code="id",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858",
            "ISO-8859-1", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet=ascii_letters,
        num_training_docs=114_888,
        num_training_chars=300_000_301,
    ),
    "Italian": Language(
        name="Italian",
        iso_code="it",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858",
            "ISO-8859-1", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ÀÈÉÌÒÓÙàèéìòóù"))),
        num_training_docs=92_388,
        num_training_chars=300_081_924,
    ),
    "Japanese": Language(
        name="Japanese",
        iso_code="ja",
        use_ascii=False,
        charsets=[
            "CP932", "EUC-JP", "ISO-2022-JP", "SHIFT-JIS",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
        ],
        alphabet="",
        num_training_docs=None,
        num_training_chars=None,
    ),
    "Kazakh": Language(
        name="Kazakh",
        iso_code="kk",
        use_ascii=False,
        charsets=[
            "KZ1048", "PTCP154",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
        ],
        alphabet="АӘБВГҒДЕЁЖЗИЙКҚЛМНҢОӨПРСТУҰҮФХҺЦЧШЩЪЫІЬЭЮЯаәбвгғдеёжзийкқлмнңоөпрстуұүфхһцчшщъыіьэюя",
        num_training_docs=73_160,
        num_training_chars=300_002_618,
    ),
    "Korean": Language(
        name="Korean",
        iso_code="ko",
        use_ascii=False,
        charsets=[
            "CP949", "EUC-KR", "ISO-2022-KR", "JOHAB",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
        ],
        alphabet="",
        num_training_docs=None,
        num_training_chars=None,
    ),
    "Cornish": Language(
        name="Cornish",
        iso_code="kw",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858",
            "ISO-8859-1", "ISO-8859-14", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ÂÊÎÔÛâêîôûŴŵŶŷ"))),
        num_training_docs=94,
        num_training_chars=58_047,
    ),
    "Lithuanian": Language(
        name="Lithuanian",
        iso_code="lt",
        use_ascii=True,
        charsets=[
            "CP775", "ISO-8859-4", "ISO-8859-13",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1257",
        ],
        # Include Q, W, X for loanwords
        alphabet="".join(sorted(set(ascii_letters + "ąčęėįšųūžĄČĘĖĮŠŲŪŽ"))),
        num_training_docs=73_445,
        num_training_chars=300_008_498,
    ),
    "Latvian": Language(
        name="Latvian",
        iso_code="lv",
        use_ascii=True,
        charsets=[
            "CP775", "ISO-8859-4", "ISO-8859-13",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1257",
        ],
        # Include Q, W, X, Y for loanwords
        alphabet="".join(sorted(set(ascii_letters + "āčēģīķļņšūžĀČĒĢĪĶĻŅŠŪŽ"))),
        num_training_docs=71_628,
        num_training_chars=300_007_767,
    ),
    "Macedonian": Language(
        name="Macedonian",
        iso_code="mk",
        use_ascii=False,
        charsets=[
            "CP855", "CP866", "ISO-8859-5", "KOI8-R", "MacCyrillic",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1251",
        ],
        alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
        num_training_docs=102_724,
        num_training_chars=300_009_729,
    ),
    "Maltese": Language(
        name="Maltese",
        iso_code="mt",
        use_ascii=True,
        charsets=[
            "ISO-8859-3",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
        ],
        # Include Y for loanwords
        alphabet="".join(sorted(set(ascii_letters + "ċġħżĊĠĦŻ"))),
        num_training_docs=51_488,
        num_training_chars=300_001_960,
    ),
    "Malay": Language(
        name="Malay",
        iso_code="ms",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858",
            "ISO-8859-1", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet=ascii_letters,
        num_training_docs=238_150,
        num_training_chars=93_983_043,
    ),
    "Dutch": Language(
        name="Dutch",
        iso_code="nl",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858",
            "ISO-8859-1", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        # Include loanword characters commonly used in Dutch
        alphabet="".join(sorted(set(ascii_letters + "àâçèéêëïîñôùûœÀÂÇÈÉÊËÏÎÑÔÙÛŒ"))),
        num_training_docs=107_675,
        num_training_chars=300_000_260,
    ),
    "Norwegian": Language(
        name="Norwegian",
        iso_code="no",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858", "CP865",
            "ISO-8859-1", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        # Include Q, W, X, Z and common loanword characters
        alphabet="".join(sorted(set(ascii_letters + "ÆØÅæøå"))),
        num_training_docs=66_762,
        num_training_chars=300_001_076,
    ),
    "Polish": Language(
        name="Polish",
        iso_code="pl",
        use_ascii=True,
        charsets=[
            "CP852", "ISO-8859-2", "ISO-8859-16", "MacLatin2",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1250",
        ],
        # Include Q, X for loanwords
        alphabet="".join(sorted(set(ascii_letters + "ąćęłńóśźżĄĆĘŁŃÓŚŹŻ"))),
        num_training_docs=97_060,
        num_training_chars=300_001_942,
    ),
    "Portuguese": Language(
        name="Portuguese",
        iso_code="pt",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858", "CP860",
            "ISO-8859-1", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú"))),
        num_training_docs=101_817,
        num_training_chars=300_001_295,
    ),
    "Romanian": Language(
        name="Romanian",
        iso_code="ro",
        use_ascii=True,
        charsets=[
            "CP852", "ISO-8859-2", "ISO-8859-16", "MacLatin2",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1250",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ăâîșțĂÂÎȘȚ"))),
        num_training_docs=78_976,
        num_training_chars=300_001_970,
    ),
    "Russian": Language(
        name="Russian",
        iso_code="ru",
        use_ascii=False,
        charsets=[
            "CP855", "CP866", "ISO-8859-5", "KOI8-R", "MacCyrillic",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1251",
        ],
        alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
        num_training_docs=85_054,
        num_training_chars=300_001_344,
    ),
    "Slovak": Language(
        name="Slovak",
        iso_code="sk",
        use_ascii=True,
        charsets=[
            "CP852", "ISO-8859-2", "ISO-8859-16", "MacLatin2",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1250",
        ],
        alphabet="".join(
            sorted(set(ascii_letters + "áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ"))
        ),
        num_training_docs=95_223,
        num_training_chars=300_000_329,
    ),
    "Slovene": Language(
        name="Slovene",
        iso_code="sl",
        use_ascii=True,
        charsets=[
            "CP852", "ISO-8859-2", "ISO-8859-16", "MacLatin2",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1250",
        ],
        # Include Q, W, X, Y for loanwords
        alphabet="".join(sorted(set(ascii_letters + "čšžČŠŽ"))),
        num_training_docs=66_688,
        num_training_chars=300_002_768,
    ),
    # Serbian can be written in both Latin and Cyrillic, but there's no
    # simple way to get the Latin alphabet pages from Wikipedia through
    # the API, so for now we just support Cyrillic. The Latin alphabet
    # is the same as Croatian, so we reuse that language model to handle
    # Latin-Serbian text.
    "Serbian": Language(
        name="Serbian",
        iso_code="sr",
        use_ascii=False,
        alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
        charsets=[
            "CP855", "CP866", "ISO-8859-5", "KOI8-R", "MacCyrillic",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1251",
        ],
        num_training_docs=66_918,
        num_training_chars=300_000_904,
    ),
    "Swedish": Language(
        name="Swedish",
        iso_code="sv",
        use_ascii=True,
        charsets=[
            "CP037", "CP437", "CP500", "CP850", "CP858",
            "ISO-8859-1", "ISO-8859-15", "MacRoman",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1252",
        ],
        # Include Q, W, Z (loanwords) and common accented characters
        alphabet="".join(sorted(set(ascii_letters + "ÅÄÖåäö"))),
        num_training_docs=96_485,
        num_training_chars=300_013_381,
    ),
    "Tajik": Language(
        name="Tajik",
        iso_code="tg",
        use_ascii=False,
        charsets=[
            "KOI8-T",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
        ],
        alphabet="АБВГҒДЕЁЖЗИӢЙКҚЛМНОПРСТУӮФХҲЧҶШЪЭЮЯабвгғдеёжзиӣйкқлмнопрстуӯфхҳчҷшъэюя",
        num_training_docs=74_865,
        num_training_chars=300_022_133,
    ),
    "Thai": Language(
        name="Thai",
        iso_code="th",
        use_ascii=False,
        charsets=[
            "CP874", "ISO-8859-11", "TIS-620",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
        ],
        alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
        num_training_docs=109_133,
        num_training_chars=300_008_106,
    ),
    "Turkish": Language(
        name="Turkish",
        iso_code="tr",
        use_ascii=True,
        charsets=[
            "CP857", "CP1026", "ISO-8859-3", "ISO-8859-9", "MacTurkish",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1254",
        ],
        # Include Q, W, X for loanwords
        alphabet="".join(sorted(set(ascii_letters + "çğıiöşüâîûÇĞIİÖŞÜÂÎÛ"))),
        num_training_docs=107_848,
        num_training_chars=300_001_308,
    ),
    "Ukrainian": Language(
        name="Ukrainian",
        iso_code="uk",
        use_ascii=False,
        charsets=[
            "CP855", "CP866", "CP1125", "ISO-8859-5", "KOI8-R", "KOI8-U", "MacCyrillic",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1251",
        ],
        alphabet="АБВГҐДЕЄЖЗИІЇЙКЛМНОПРСТУФХЦЧШЩЬЮЯабвгґдеєжзиіїйклмнопрстуфхцчшщьюяʼ",
        num_training_docs=95_019,
        num_training_chars=300_048_946,
    ),
    "Urdu": Language(
        name="Urdu",
        iso_code="ur",
        use_ascii=False,
        charsets=[
            "CP1006",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
        ],
        alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيپچڈڑژکگںھۂۃیے",
        num_training_docs=82_296,
        num_training_chars=300_001_141,
    ),
    "Vietnamese": Language(
        name="Vietnamese",
        iso_code="vi",
        use_ascii=False,
        # Windows-1258 is the only common 8-bit
        # Vietnamese encoding supported by Python.
        # From Wikipedia:
        # For systems that lack support for Unicode,
        # dozens of 8-bit Vietnamese code pages are
        # available.[1] The most common are VISCII
        # (TCVN 5712:1993), VPS, and Windows-1258.[3]
        # Where ASCII is required, such as when
        # ensuring readability in plain text e-mail,
        # Vietnamese letters are often encoded
        # according to Vietnamese Quoted-Readable
        # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
        # though usage of either variable-width
        # scheme has declined dramatically following
        # the adoption of Unicode on the World Wide
        # Web.
        charsets=[
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
            "WINDOWS-1258",
        ],
        # Alphabet includes base letters and combining tone marks used by Windows-1258:
        # - Base letters: a, ă, â, e, ê, o, ô, ơ, u, ư, etc.
        # - Combining marks: grave (U+0300), acute (U+0301), tilde (U+0303),
        #   hook above (U+0309), dot below (U+0323)
        alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY\u0300\u0301\u0303\u0309\u0323",
        num_training_docs=85_692,
        num_training_chars=300_000_871,
    ),
    "Chinese": Language(
        name="Chinese",
        iso_code="zh",
        use_ascii=False,
        charsets=[
            "BIG5", "GB18030", "HZ-GB-2312",
            "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
        ],
        alphabet="",
        num_training_docs=None,
        num_training_chars=None,
    ),
}