"""
Metadata about languages used by our model training code for our
SingleByteCharSetProbers. Could be used for other things in the future.
This code was originally based on the language metadata from the uchardet
project.
"""
from dataclasses import dataclass, field
from string import ascii_letters
[docs]
@dataclass(frozen=True)
class Language:
"""Metadata about a language useful for training models
:ivar name: The human name for the language, in English.
:type name: str
:ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
or use another catalog as a last resort.
:type iso_code: str
:ivar use_ascii: Whether or not ASCII letters should be included in trained
models.
:type use_ascii: bool
:ivar charsets: The charsets we want to support and create data for.
:type charsets: list of str
:ivar alphabet: The characters in the language's alphabet. If `use_ascii` is
`True`, you only need to add those not in the ASCII set.
:type alphabet: str
:ivar num_training_docs: Number of documents from CulturaX to use for training.
This represents approximately 300M characters of training data.
None means the count hasn't been determined yet.
:type num_training_docs: int or None
:ivar num_training_chars: Number of characters from CulturaX used for training.
The goal is for this to be at least 300M characters, but some
languages may not have that much data available.
None means the count hasn't been determined yet.
:type num_training_chars: int or None
"""
name: str
iso_code: str
use_ascii: bool
charsets: list[str]
alphabet: str
num_training_docs: int | None = None
num_training_chars: int | None = None
def __repr__(self) -> str:
param_str = ", ".join(
f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
)
return f"{self.__class__.__name__}({param_str})"
# Registry of every supported language, keyed by its English name; each key
# matches the ``name`` field of its ``Language`` value.
#
# Conventions visible in the entries below:
# - Most ``use_ascii=True`` entries build their alphabet as the sorted,
#   de-duplicated union of ``ascii_letters`` and the language's extra letters;
#   English, Indonesian and Malay use ``ascii_letters`` unchanged.
# - Japanese, Korean and Chinese carry an empty alphabet and ``None`` training
#   counts.
LANGUAGES: dict[str, Language] = {
    "Arabic": Language(
        name="Arabic",
        iso_code="ar",
        use_ascii=False,
        charsets=[
            "CP720",
            "CP864",
            "ISO-8859-6",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1256",
        ],
        alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
        num_training_docs=98_634,
        num_training_chars=300_001_734,
    ),
    "Belarusian": Language(
        name="Belarusian",
        iso_code="be",
        use_ascii=False,
        charsets=[
            "CP855",
            "CP866",
            "ISO-8859-5",
            "KOI8-R",
            "MacCyrillic",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1251",
        ],
        alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
        num_training_docs=77_015,
        num_training_chars=300_003_525,
    ),
    "Bulgarian": Language(
        name="Bulgarian",
        iso_code="bg",
        use_ascii=False,
        charsets=[
            "CP855",
            "CP866",
            "ISO-8859-5",
            "KOI8-R",
            "MacCyrillic",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1251",
        ],
        alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
        num_training_docs=84_186,
        num_training_chars=300_001_428,
    ),
    "Breton": Language(
        name="Breton",
        iso_code="br",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "ISO-8859-1",
            "ISO-8859-14",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ÀÂÈÊÎÔÙÛàâèêîôùû"))),
        num_training_docs=43_761,
        num_training_chars=39_597_819,
    ),
    "Welsh": Language(
        name="Welsh",
        iso_code="cy",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "ISO-8859-1",
            "ISO-8859-14",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        # ẀẂẄ never seem to occur in training data, so don't include them
        alphabet="".join(
            sorted(set(ascii_letters + "ÁÂÄÉÊËÍÎÏÓÔÖÚÛÜÝáâäéêëíîïóôöúûüýÿŴŵŶŷŸẁẃẅỲỳ"))
        ),
        num_training_docs=78_726,
        num_training_chars=300_014_737,
    ),
    "Czech": Language(
        name="Czech",
        iso_code="cs",
        use_ascii=True,
        charsets=[
            "CP852",
            "ISO-8859-2",
            "ISO-8859-16",
            "MacLatin2",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1250",
        ],
        alphabet="".join(sorted(set(ascii_letters + "áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ"))),
        num_training_docs=98_820,
        num_training_chars=300_005_076,
    ),
    "Danish": Language(
        name="Danish",
        iso_code="da",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "CP865",
            "ISO-8859-1",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "æøåÆØÅ"))),
        num_training_docs=83_466,
        num_training_chars=300_004_040,
    ),
    "German": Language(
        name="German",
        iso_code="de",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "ISO-8859-1",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "äöüßẞÄÖÜ"))),
        num_training_docs=83_754,
        num_training_chars=300_002_572,
    ),
    "Greek": Language(
        name="Greek",
        iso_code="el",
        use_ascii=False,
        charsets=[
            "CP737",
            "CP869",
            "CP875",
            "ISO-8859-7",
            "MacGreek",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1253",
        ],
        # Capital Σ is listed once: it is the uppercase of both σ and final ς.
        alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
        num_training_docs=103_810,
        num_training_chars=300_003_051,
    ),
    "English": Language(
        name="English",
        iso_code="en",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "ISO-8859-1",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet=ascii_letters,
        num_training_docs=84_511,
        num_training_chars=300_004_447,
    ),
    "Esperanto": Language(
        name="Esperanto",
        iso_code="eo",
        use_ascii=True,
        charsets=[
            "ISO-8859-3",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
        ],
        # Include Q, W, X, Y for loanwords
        alphabet="".join(sorted(set(ascii_letters + "ĉĝĥĵŝŭĈĜĤĴŜŬ"))),
        num_training_docs=40_441,
        num_training_chars=300_001_893,
    ),
    "Spanish": Language(
        name="Spanish",
        iso_code="es",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "ISO-8859-1",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ñáéíóúüÑÁÉÍÓÚÜ"))),
        num_training_docs=87_069,
        num_training_chars=300_000_884,
    ),
    "Estonian": Language(
        name="Estonian",
        iso_code="et",
        use_ascii=True,
        charsets=[
            "CP775",
            "ISO-8859-4",
            "ISO-8859-13",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1257",
        ],
        # Include C, F, Š, Q, W, X, Y, Z, Ž for loanwords
        alphabet="".join(sorted(set(ascii_letters + "õäöüšžÕÄÖÜŠŽ"))),
        num_training_docs=66_818,
        num_training_chars=300_000_765,
    ),
    "Farsi": Language(
        name="Farsi",
        iso_code="fa",
        use_ascii=False,
        charsets=[
            "CP720",
            "CP864",
            "ISO-8859-6",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1256",
        ],
        alphabet="ءآأؤإئابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیًٌٍَُِّ",
        num_training_docs=104_835,
        num_training_chars=300_001_684,
    ),
    "Finnish": Language(
        name="Finnish",
        iso_code="fi",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "ISO-8859-1",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ÅÄÖŠŽåäöšž"))),
        num_training_docs=80_361,
        num_training_chars=300_001_375,
    ),
    "French": Language(
        name="French",
        iso_code="fr",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "CP863",
            "ISO-8859-1",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(
            sorted(set(ascii_letters + "àâæçèéêëîïôùûüÿÀÂÆÇÈÉÊËÎÏÔÙÛÜŸŒœ"))
        ),
        num_training_docs=88_369,
        num_training_chars=300_003_422,
    ),
    "Irish": Language(
        name="Irish",
        iso_code="ga",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "ISO-8859-1",
            "ISO-8859-14",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ÁÉÍÓÚáéíóú"))),
        num_training_docs=63_468,
        num_training_chars=300_005_169,
    ),
    "Scottish Gaelic": Language(
        name="Scottish Gaelic",
        iso_code="gd",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "ISO-8859-1",
            "ISO-8859-14",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ÀÈÌÒÙàèìòù"))),
        num_training_docs=8_408,
        num_training_chars=15_116_797,
    ),
    "Hebrew": Language(
        name="Hebrew",
        iso_code="he",
        use_ascii=False,
        charsets=[
            "CP424",
            "CP856",
            "CP862",
            "ISO-8859-8",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1255",
        ],
        alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
        num_training_docs=58_265,
        num_training_chars=300_009_414,
    ),
    "Croatian": Language(
        name="Croatian",
        iso_code="hr",
        use_ascii=True,
        charsets=[
            "CP852",
            "ISO-8859-2",
            "ISO-8859-16",
            "MacLatin2",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1250",
        ],
        # Include Q, W, X, Y for loanwords
        alphabet="".join(sorted(set(ascii_letters + "čćđšžČĆĐŠŽ"))),
        num_training_docs=460_689,
        num_training_chars=157_579_665,
    ),
    "Hungarian": Language(
        name="Hungarian",
        iso_code="hu",
        use_ascii=True,
        charsets=[
            "CP852",
            "ISO-8859-2",
            "ISO-8859-16",
            "MacLatin2",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1250",
        ],
        # Include Q, W, X, Y for loanwords
        alphabet="".join(sorted(set(ascii_letters + "áéíóöőúüűÁÉÍÓÖŐÚÜŰ"))),
        num_training_docs=82_417,
        num_training_chars=300_001_846,
    ),
    "Icelandic": Language(
        name="Icelandic",
        iso_code="is",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "CP861",
            "ISO-8859-1",
            "ISO-8859-10",
            "ISO-8859-15",
            "MacIceland",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        # Include Q, W for loanwords
        alphabet="".join(sorted(set(ascii_letters + "áðéíóúýþæöÁÐÉÍÓÚÝÞÆÖ"))),
        num_training_docs=77_487,
        num_training_chars=300_004_354,
    ),
    "Indonesian": Language(
        name="Indonesian",
        iso_code="id",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "ISO-8859-1",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet=ascii_letters,
        num_training_docs=114_888,
        num_training_chars=300_000_301,
    ),
    "Italian": Language(
        name="Italian",
        iso_code="it",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "ISO-8859-1",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ÀÈÉÌÒÓÙàèéìòóù"))),
        num_training_docs=92_388,
        num_training_chars=300_081_924,
    ),
    "Japanese": Language(
        name="Japanese",
        iso_code="ja",
        use_ascii=False,
        charsets=[
            "CP932",
            "EUC-JP",
            "ISO-2022-JP",
            "SHIFT-JIS",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
        ],
        alphabet="",
        num_training_docs=None,
        num_training_chars=None,
    ),
    "Kazakh": Language(
        name="Kazakh",
        iso_code="kk",
        use_ascii=False,
        charsets=[
            "KZ1048",
            "PTCP154",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
        ],
        alphabet="АӘБВГҒДЕЁЖЗИЙКҚЛМНҢОӨПРСТУҰҮФХҺЦЧШЩЪЫІЬЭЮЯаәбвгғдеёжзийкқлмнңоөпрстуұүфхһцчшщъыіьэюя",
        num_training_docs=73_160,
        num_training_chars=300_002_618,
    ),
    "Korean": Language(
        name="Korean",
        iso_code="ko",
        use_ascii=False,
        charsets=[
            "CP949",
            "EUC-KR",
            "ISO-2022-KR",
            "JOHAB",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
        ],
        alphabet="",
        num_training_docs=None,
        num_training_chars=None,
    ),
    "Cornish": Language(
        name="Cornish",
        iso_code="kw",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "ISO-8859-1",
            "ISO-8859-14",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ÂÊÎÔÛâêîôûŴŵŶŷ"))),
        num_training_docs=94,
        num_training_chars=58_047,
    ),
    "Lithuanian": Language(
        name="Lithuanian",
        iso_code="lt",
        use_ascii=True,
        charsets=[
            "CP775",
            "ISO-8859-4",
            "ISO-8859-13",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1257",
        ],
        # Include Q, W, X for loanwords
        alphabet="".join(sorted(set(ascii_letters + "ąčęėįšųūžĄČĘĖĮŠŲŪŽ"))),
        num_training_docs=73_445,
        num_training_chars=300_008_498,
    ),
    "Latvian": Language(
        name="Latvian",
        iso_code="lv",
        use_ascii=True,
        charsets=[
            "CP775",
            "ISO-8859-4",
            "ISO-8859-13",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1257",
        ],
        # Include Q, W, X, Y for loanwords
        alphabet="".join(sorted(set(ascii_letters + "āčēģīķļņšūžĀČĒĢĪĶĻŅŠŪŽ"))),
        num_training_docs=71_628,
        num_training_chars=300_007_767,
    ),
    "Macedonian": Language(
        name="Macedonian",
        iso_code="mk",
        use_ascii=False,
        charsets=[
            "CP855",
            "CP866",
            "ISO-8859-5",
            "KOI8-R",
            "MacCyrillic",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1251",
        ],
        alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
        num_training_docs=102_724,
        num_training_chars=300_009_729,
    ),
    "Maltese": Language(
        name="Maltese",
        iso_code="mt",
        use_ascii=True,
        charsets=[
            "ISO-8859-3",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
        ],
        # Include Y for loanwords
        alphabet="".join(sorted(set(ascii_letters + "ċġħżĊĠĦŻ"))),
        num_training_docs=51_488,
        num_training_chars=300_001_960,
    ),
    "Malay": Language(
        name="Malay",
        iso_code="ms",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "ISO-8859-1",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet=ascii_letters,
        num_training_docs=238_150,
        num_training_chars=93_983_043,
    ),
    "Dutch": Language(
        name="Dutch",
        iso_code="nl",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "ISO-8859-1",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        # Include loanword characters commonly used in Dutch
        alphabet="".join(sorted(set(ascii_letters + "àâçèéêëïîñôùûœÀÂÇÈÉÊËÏÎÑÔÙÛŒ"))),
        num_training_docs=107_675,
        num_training_chars=300_000_260,
    ),
    "Norwegian": Language(
        name="Norwegian",
        iso_code="no",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "CP865",
            "ISO-8859-1",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        # Include Q, W, X, Z and common loanword characters
        alphabet="".join(sorted(set(ascii_letters + "ÆØÅæøå"))),
        num_training_docs=66_762,
        num_training_chars=300_001_076,
    ),
    "Polish": Language(
        name="Polish",
        iso_code="pl",
        use_ascii=True,
        charsets=[
            "CP852",
            "ISO-8859-2",
            "ISO-8859-16",
            "MacLatin2",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1250",
        ],
        # Include Q, X for loanwords
        alphabet="".join(sorted(set(ascii_letters + "ąćęłńóśźżĄĆĘŁŃÓŚŹŻ"))),
        num_training_docs=97_060,
        num_training_chars=300_001_942,
    ),
    "Portuguese": Language(
        name="Portuguese",
        iso_code="pt",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "CP860",
            "ISO-8859-1",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú"))),
        num_training_docs=101_817,
        num_training_chars=300_001_295,
    ),
    "Romanian": Language(
        name="Romanian",
        iso_code="ro",
        use_ascii=True,
        charsets=[
            "CP852",
            "ISO-8859-2",
            "ISO-8859-16",
            "MacLatin2",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1250",
        ],
        alphabet="".join(sorted(set(ascii_letters + "ăâîșțĂÂÎȘȚ"))),
        num_training_docs=78_976,
        num_training_chars=300_001_970,
    ),
    "Russian": Language(
        name="Russian",
        iso_code="ru",
        use_ascii=False,
        charsets=[
            "CP855",
            "CP866",
            "ISO-8859-5",
            "KOI8-R",
            "MacCyrillic",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1251",
        ],
        alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
        num_training_docs=85_054,
        num_training_chars=300_001_344,
    ),
    "Slovak": Language(
        name="Slovak",
        iso_code="sk",
        use_ascii=True,
        charsets=[
            "CP852",
            "ISO-8859-2",
            "ISO-8859-16",
            "MacLatin2",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1250",
        ],
        alphabet="".join(
            sorted(set(ascii_letters + "áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ"))
        ),
        num_training_docs=95_223,
        num_training_chars=300_000_329,
    ),
    "Slovene": Language(
        name="Slovene",
        iso_code="sl",
        use_ascii=True,
        charsets=[
            "CP852",
            "ISO-8859-2",
            "ISO-8859-16",
            "MacLatin2",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1250",
        ],
        # Include Q, W, X, Y for loanwords
        alphabet="".join(sorted(set(ascii_letters + "čšžČŠŽ"))),
        num_training_docs=66_688,
        num_training_chars=300_002_768,
    ),
    # Serbian can be written in both Latin and Cyrillic, but there's no
    # simple way to get the Latin alphabet pages from Wikipedia through
    # the API, so for now we just support Cyrillic. The Latin alphabet
    # is the same as Croatian, so we reuse that language model to handle
    # Latin-Serbian text.
    # NOTE(review): the Language docstring names CulturaX as the source of the
    # training counts, so the Wikipedia remark above may be stale — confirm.
    "Serbian": Language(
        name="Serbian",
        iso_code="sr",
        use_ascii=False,
        charsets=[
            "CP855",
            "CP866",
            "ISO-8859-5",
            "KOI8-R",
            "MacCyrillic",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1251",
        ],
        alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
        num_training_docs=66_918,
        num_training_chars=300_000_904,
    ),
    "Swedish": Language(
        name="Swedish",
        iso_code="sv",
        use_ascii=True,
        charsets=[
            "CP037",
            "CP437",
            "CP500",
            "CP850",
            "CP858",
            "ISO-8859-1",
            "ISO-8859-15",
            "MacRoman",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1252",
        ],
        # Include Q, W, Z (loanwords) and common accented characters
        alphabet="".join(sorted(set(ascii_letters + "ÅÄÖåäö"))),
        num_training_docs=96_485,
        num_training_chars=300_013_381,
    ),
    "Tajik": Language(
        name="Tajik",
        iso_code="tg",
        use_ascii=False,
        charsets=[
            "KOI8-T",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
        ],
        alphabet="АБВГҒДЕЁЖЗИӢЙКҚЛМНОПРСТУӮФХҲЧҶШЪЭЮЯабвгғдеёжзиӣйкқлмнопрстуӯфхҳчҷшъэюя",
        num_training_docs=74_865,
        num_training_chars=300_022_133,
    ),
    "Thai": Language(
        name="Thai",
        iso_code="th",
        use_ascii=False,
        charsets=[
            "CP874",
            "ISO-8859-11",
            "TIS-620",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
        ],
        alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
        num_training_docs=109_133,
        num_training_chars=300_008_106,
    ),
    "Turkish": Language(
        name="Turkish",
        iso_code="tr",
        use_ascii=True,
        charsets=[
            "CP857",
            "CP1026",
            "ISO-8859-3",
            "ISO-8859-9",
            "MacTurkish",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1254",
        ],
        # Include Q, W, X for loanwords
        alphabet="".join(sorted(set(ascii_letters + "çğıiöşüâîûÇĞIİÖŞÜÂÎÛ"))),
        num_training_docs=107_848,
        num_training_chars=300_001_308,
    ),
    "Ukrainian": Language(
        name="Ukrainian",
        iso_code="uk",
        use_ascii=False,
        charsets=[
            "CP855",
            "CP866",
            "CP1125",
            "ISO-8859-5",
            "KOI8-R",
            "KOI8-U",
            "MacCyrillic",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1251",
        ],
        alphabet="АБВГҐДЕЄЖЗИІЇЙКЛМНОПРСТУФХЦЧШЩЬЮЯабвгґдеєжзиіїйклмнопрстуфхцчшщьюяʼ",
        num_training_docs=95_019,
        num_training_chars=300_048_946,
    ),
    "Urdu": Language(
        name="Urdu",
        iso_code="ur",
        use_ascii=False,
        charsets=[
            "CP1006",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
        ],
        alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيپچڈڑژکگںھۂۃیے",
        num_training_docs=82_296,
        num_training_chars=300_001_141,
    ),
    "Vietnamese": Language(
        name="Vietnamese",
        iso_code="vi",
        use_ascii=False,
        # Windows-1258 is the only common 8-bit
        # Vietnamese encoding supported by Python.
        # From Wikipedia:
        # For systems that lack support for Unicode,
        # dozens of 8-bit Vietnamese code pages are
        # available.[1] The most common are VISCII
        # (TCVN 5712:1993), VPS, and Windows-1258.[3]
        # Where ASCII is required, such as when
        # ensuring readability in plain text e-mail,
        # Vietnamese letters are often encoded
        # according to Vietnamese Quoted-Readable
        # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
        # though usage of either variable-width
        # scheme has declined dramatically following
        # the adoption of Unicode on the World Wide
        # Web.
        charsets=[
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
            "WINDOWS-1258",
        ],
        # Alphabet includes base letters and combining tone marks used by Windows-1258:
        # - Base letters: a, ă, â, e, ê, o, ô, ơ, u, ư, etc.
        # - Combining marks: grave (̀), acute (́), tilde (̃), hook above (̉), dot below (̣)
        alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY\u0300\u0301\u0303\u0309\u0323",
        num_training_docs=85_692,
        num_training_chars=300_000_871,
    ),
    "Chinese": Language(
        name="Chinese",
        iso_code="zh",
        use_ascii=False,
        charsets=[
            "BIG5",
            "GB18030",
            "HZ-GB-2312",
            "UTF-8",
            "UTF-16",
            "UTF-16BE",
            "UTF-16LE",
            "UTF-32",
            "UTF-32BE",
            "UTF-32LE",
        ],
        alphabet="",
        num_training_docs=None,
        num_training_chars=None,
    ),
}