| 1 | #!/usr/bin/env python
|
|---|
| 2 |
|
|---|
| 3 | import argparse
|
|---|
| 4 | import os.path
|
|---|
| 5 | from glob import glob
|
|---|
| 6 |
|
|---|
| 7 | from charset_normalizer import from_fp
|
|---|
| 8 | from charset_normalizer.models import CliDetectionResult
|
|---|
| 9 |
|
|---|
def main(argv=None):
    """Detect the encoding of the given files and write UTF-8 copies.

    Every positional argument is expanded with ``glob``. Each match that is a
    regular file is read in binary mode and handed to
    ``charset_normalizer.from_fp``; the best guess is printed unless
    ``--quiet`` is given. Unless ``--print-only`` is given, a file whose
    guessed encoding is neither ``utf_8`` nor ``ascii`` is decoded with the
    guessed encoding and written next to the original as
    ``<name>-utf_8<ext>``. The original file is never modified.

    Args:
        argv: Argument list to parse. ``None`` lets argparse fall back to
            the process command line.
    """
    parser = argparse.ArgumentParser(
        description="change file encoding to UTF-8",
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "infile",
        metavar="input file(s)",
        type=str,
        nargs="*",
        help="files",
    )

    # --print-only and --quiet contradict each other, so argparse rejects
    # a command line that supplies both.
    print_group = parser.add_mutually_exclusive_group()
    print_group.add_argument(
        "-p",
        "--print-only",
        action="store_true",
        default=False,
        dest="print_only",
        help="only print file encoding",
    )

    print_group.add_argument(
        "-q",
        "--quiet",
        action="store_true",
        default=False,
        dest="quiet",
        help="do not print encodings",
    )

    args = parser.parse_args(argv)

    for fglob in args.infile:
        for fname in glob(fglob):
            fname = os.path.abspath(fname)
            # Globs can match directories and other non-files; skip those.
            if not os.path.isfile(fname):
                continue
            with open(fname, "rb") as f:
                matches = from_fp(f, threshold=0.2, explain=False)
                # best guess is None if there's no good match
                best_guess = matches.best()
            if not best_guess:
                if not args.quiet:
                    print(fname, "\tno guess")
                continue
            if not args.quiet:
                print(fname, "\t", best_guess.encoding)
            if args.print_only:
                continue
            # ASCII is a subset of UTF-8, so neither needs converting.
            if best_guess.encoding in ("utf_8", "ascii"):
                continue
            # try to read the file using the guessed encoding
            # then try to write a new file
            print("converting %s to UTF-8" % fname)
            # newline="" disables universal-newline translation so the
            # original line endings reach the output unchanged; only the
            # character encoding is supposed to differ.
            with open(
                fname, "r", encoding=best_guess.encoding, newline=""
            ) as f:
                data = f.read()

            name, ext = os.path.splitext(fname)
            new_name = "%s-utf_8%s" % (name, ext)
            print("writing to", new_name)
            with open(new_name, "w", encoding="utf-8", newline="") as f:
                f.write(data)
|
|---|
| 72 |
|
|---|
| 73 |
|
|---|
| 74 |
|
|---|
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|---|