"""
Script which takes one or more file paths and reports on their detected
encodings
Example::
% chardetect somefile someotherfile
somefile: windows-1252 with confidence 0.5
someotherfile: ascii with confidence 1.0
If no paths are provided, it takes its input from stdin.
"""
import argparse
import sys
from collections.abc import Iterable
from typing import Optional
from .. import __version__
from ..enums import EncodingEra
from ..universaldetector import UniversalDetector
[docs]
def description_of(
    lines: Iterable[bytes],
    name: str = "stdin",
    minimal: bool = False,
    should_rename_legacy: bool = False,
    encoding_era: EncodingEra = EncodingEra.MODERN_WEB,
) -> Optional[str]:
    """
    Describe the probable encoding of a file or other iterable of byte lines.

    Each item of ``lines`` is fed to a fresh ``UniversalDetector`` until the
    input is exhausted or the detector reports that it is done.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines; used as the prefix of
                 the returned description.
    :type name: str
    :param minimal: If true, return only the detected encoding (whatever the
                    detector stored under ``"encoding"``) instead of a full
                    sentence.
    :type minimal: ``bool``
    :param should_rename_legacy: Should we rename legacy encodings to
                                 their more modern equivalents?
    :type should_rename_legacy: ``bool``
    :param encoding_era: Which era of encodings to consider during detection.
    :type encoding_era: ``EncodingEra``
    :returns: The bare encoding when ``minimal`` is set; otherwise
              ``"<name>: <encoding> with confidence <confidence>"``, or
              ``"<name>: no result"`` when no encoding was detected.
    """
    detector = UniversalDetector(
        should_rename_legacy=should_rename_legacy, encoding_era=encoding_era
    )

    for chunk in lines:
        detector.feed(bytearray(chunk))
        # Stop reading as soon as the detector has made up its mind; this
        # saves work on large inputs, e.g. when a BOM settles the question.
        if detector.done:
            break
    detector.close()

    outcome = detector.result
    encoding = outcome["encoding"]

    if minimal:
        return encoding
    if not encoding:
        return f"{name}: no result"
    return f"{name}: {encoding} with confidence {outcome['confidence']}"
[docs]
def main(argv: Optional[list[str]] = None) -> None:
    """
    Handles command line arguments and gets things started.

    Parses the arguments, resolves the requested encoding era, and prints
    one ``description_of`` line per input file. Every input handle that was
    opened for this run is closed once it has been processed; the process's
    standard input is left open.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    # Remember the stdin buffer so it can be told apart from files that
    # argparse opens on our behalf (those must be closed, stdin must not).
    stdin_buffer = sys.stdin.buffer

    # Get command line arguments
    parser = argparse.ArgumentParser(
        description=(
            "Takes one or more file paths and reports their detected encodings"
        )
    )
    parser.add_argument(
        "input",
        help="File whose encoding we would like to determine. (default: stdin)",
        type=argparse.FileType("rb"),
        nargs="*",
        default=[stdin_buffer],
    )
    parser.add_argument(
        "--minimal",
        help="Print only the encoding to standard output",
        action="store_true",
    )
    parser.add_argument(
        "-l",
        "--legacy",
        help="Rename legacy encodings to more modern ones.",
        action="store_true",
    )
    # "ALL" is filtered out of the member names and appended by hand so that
    # it is always listed last in the help and error text.
    era_names = [e.name for e in EncodingEra if e.name is not None and e.name != "ALL"]
    parser.add_argument(
        "-e",
        "--encoding-era",
        help="Which era of encodings to consider (default: MODERN_WEB). "
        f"Choices: {', '.join(era_names)}, ALL",
        default="MODERN_WEB",
        # Upper-case the value so the era can be given case-insensitively.
        type=str.upper,
    )
    parser.add_argument(
        "--version", action="version", version=f"%(prog)s {__version__}"
    )
    args = parser.parse_args(argv)

    try:
        encoding_era = EncodingEra[args.encoding_era]
    except KeyError:
        # parser.error() prints the usage message and exits.
        parser.error(
            f"invalid encoding era: {args.encoding_era!r}. "
            f"Choose from: {', '.join(era_names)}, ALL"
        )

    for f in args.input:
        try:
            if f.isatty():
                print(
                    "You are running chardetect interactively. Press "
                    "CTRL-D twice at the start of a blank line to signal the "
                    "end of your input. If you want help, run chardetect "
                    "--help\n",
                    file=sys.stderr,
                )
            print(
                description_of(
                    f,
                    f.name,
                    minimal=args.minimal,
                    should_rename_legacy=args.legacy,
                    encoding_era=encoding_era,
                )
            )
        finally:
            # argparse.FileType opens the handle but never closes it, so
            # release it here instead of leaking it until interpreter exit.
            # Standard input belongs to the process and is left open.
            if f is not stdin_buffer and f is not sys.stdin.buffer:
                f.close()
# Run the command-line interface only when this module is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()