Ticket #8746: utf8_convert.py

File utf8_convert.py, 2.3 KB (added by Tony Schaefer, 3 years ago)

Added by email2trac

Line 
1#!/usr/bin/env python
2
3import argparse
4import os.path
5from glob import glob
6
7from charset_normalizer import from_fp
8from charset_normalizer.models import CliDetectionResult
9
def _build_parser():
    """Return the argument parser for the UTF-8 conversion command line."""
    parser = argparse.ArgumentParser(
        description="change file encoding to UTF-8",
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "infile", metavar="input file(s)",
        type=str,
        nargs="*",
        help="files",
    )

    # --print-only and --quiet contradict each other, so they share a
    # mutually exclusive group and argparse rejects passing both.
    print_group = parser.add_mutually_exclusive_group()
    print_group.add_argument(
        "-p", "--print-only",
        action="store_true",
        default=False,
        dest="print_only",
        help="only print file encoding",
    )
    print_group.add_argument(
        "-q", "--quiet",
        action="store_true",
        default=False,
        dest="quiet",
        help="do not print encodings",
    )
    return parser


def _convert_to_utf8(fname, encoding):
    """Decode *fname* with *encoding* and write a UTF-8 copy beside it.

    The copy is named ``<stem>-utf_8<ext>``; the input file itself is not
    modified. An existing file with that name is overwritten.
    """
    print("converting %s to UTF-8" % fname)
    # newline="" disables newline translation so that only the encoding
    # changes; without it CRLF/CR line endings would be rewritten to the
    # platform default on the way through.
    with open(fname, "r", encoding=encoding, newline="") as f:
        data = f.read()

    name, ext = os.path.splitext(fname)
    new_name = "%s-utf_8%s" % (name, ext)
    print("writing to", new_name)
    with open(new_name, "w", encoding="utf-8", newline="") as f:
        f.write(data)


def main(argv=None):
    """Detect the encoding of each input file and convert it to UTF-8.

    Every positional argument is expanded with ``glob``; matches that are
    not regular files are skipped. For each file the best encoding guess
    is printed (unless ``--quiet``), and unless ``--print-only`` is given
    or the file is already UTF-8/ASCII, a UTF-8 copy named
    ``<stem>-utf_8<ext>`` is written next to the original.

    Args:
        argv: Argument list to parse. ``None`` lets argparse fall back to
            the process command line.

    Returns:
        None.
    """
    args = _build_parser().parse_args(argv)

    for fglob in args.infile:
        for fname in glob(fglob):
            fname = os.path.abspath(fname)
            if not os.path.isfile(fname):
                continue

            with open(fname, "rb") as f:
                matches = from_fp(f, threshold=0.2, explain=False)
            # best guess is None if there's no good match
            best_guess = matches.best()
            if not best_guess:
                if not args.quiet:
                    print(fname, "\tno guess")
                continue

            if not args.quiet:
                print(fname, "\t", best_guess.encoding)
            if args.print_only:
                continue
            # ASCII is a subset of UTF-8, so neither needs converting.
            if best_guess.encoding in ("utf_8", "ascii"):
                continue

            _convert_to_utf8(fname, best_guess.encoding)
72
73
74
# Script entry point: run the converter on the process command line only
# when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()