forked from titoBouzout/EncodingHelper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEncodingHelper.py
executable file
·332 lines (290 loc) · 12 KB
/
EncodingHelper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
# coding=utf8
import sublime, sublime_plugin
import codecs
import sys
import os
from .chardet.universaldetector import UniversalDetector
import re
import threading
import time
# Don't parse binary files, just mark these as binary.
# Matches file names whose extension is one of the listed binary formats
# (case-insensitive). The pattern is written as raw strings so that "\." reaches
# the regex engine as an escaped dot instead of being an invalid string escape.
BINARY = re.compile(
    r'\.('
    r'apng|png|jpg|gif|jpeg|bmp|psd|ai|cdr|ico|cache|sublime-package|'
    r'eot|svgz|ttf|woff|zip|tar|gz|rar|bz2|jar|xpi|'
    r'mov|mpeg|avi|mpg|flv|wmv|mp3|wav|aif|aiff|snd|wma|asf|asx|pcm|'
    r'pdf|doc|docx|xls|xlsx|ppt|pptx|rtf|sqlite|sqlitedb|fla|swf|exe'
    r')$',
    re.I,
)
def plugin_loaded():
    """Plugin entry point: load the settings and refresh the status bar.

    Publishes two module globals: ``s``, the object returned by
    ``sublime.load_settings('EncodingHelper.sublime-settings')``, and ``Pref``,
    which is rebound from the class to an instance of that class.
    """
    global s, Pref

    def reload_preferences():
        # ``Pref`` is resolved when this runs, i.e. to the instance bound below.
        Pref.load()

    s = sublime.load_settings('EncodingHelper.sublime-settings')
    Pref = Pref()
    reload_preferences()
    # Registered under the key 'reload' so the preferences are rebuilt when the
    # settings object reports a change.
    s.add_on_change('reload', reload_preferences)

    listener = EncodingOnStatusBarListener()
    listener.init_()
class Pref:
    """Encoding preferences derived from the plugin settings.

    ``load()`` publishes two attributes:

    * ``fallback_encodings`` -- ordered, duplicate-free list of upper-cased
      encoding names to try; it always starts with "UTF-8".
    * ``open_automatically_as_utf8`` -- upper-cased names taken from the
      setting of the same name.
    """

    def load(self):
        """(Re)build both preference lists from the module-level settings ``s``."""
        import locale

        # getdefaultlocale() raises ValueError when the environment's locale
        # string cannot be parsed; it is also deprecated, so AttributeError
        # covers interpreters that no longer provide it. Either way the locale
        # simply contributes no encoding.
        try:
            _, locale_encoding = locale.getdefaultlocale()
        except (AttributeError, ValueError):
            locale_encoding = None

        candidates = ["UTF-8"]
        if locale_encoding:
            # Upper-cased like every configured name, so that comparisons
            # against open_automatically_as_utf8 (also upper-cased) can match.
            candidates.append(locale_encoding.upper())
        # ``or []`` tolerates a setting explicitly set to null.
        for name in s.get('fallback_encodings', []) or []:
            if name:
                candidates.append(name.upper())

        # Drop duplicates while keeping the order. Without this a UTF-8 locale
        # produced ["UTF-8", "UTF-8"], which slipped past the check below.
        fallback_encodings = []
        for name in candidates:
            if name not in fallback_encodings:
                fallback_encodings.append(name)
        if fallback_encodings == ["UTF-8"]:
            fallback_encodings = ["UTF-8", "ISO-8859-1"]

        automatic = [
            name.upper()
            for name in s.get('open_automatically_as_utf8', []) or []
            if name
        ]

        # Stored through the module-level name ``Pref`` rather than ``self``:
        # that is the name the rest of the file reads these attributes from.
        Pref.fallback_encodings = fallback_encodings
        Pref.open_automatically_as_utf8 = automatic
class EncodingOnStatusBarListener(sublime_plugin.EventListener):
    """Event listener that maintains the encoding indicator in the status bar.

    Per-view state is kept in two view settings:

    * 'encoding_helper_encoding'  -- the encoding guessed by this package
      ('' when there is no file on disk, 'BINARY', 'Unknown' or a codec name).
    * 'encoding_helper_converted' -- when present, the encoding the view was
      converted to UTF-8 from.
    """

    # Key under which the status bar text is registered on a view.
    STATUS_KEY = 'encoding_helper_statusbar'
    # Files larger than this many bytes are not run through detection.
    MAX_DETECT_SIZE = 1048576
    # Files smaller than this many bytes are tried against the fallback list first.
    SMALL_FILE_SIZE = 666
    # Detector results below this confidence are treated as "not detected".
    MIN_CONFIDENCE = 0.7

    def init_(self):
        """Refresh the status bar for the active view of every open window."""
        self.on_load(sublime.active_window().active_view())
        for window in sublime.windows():
            self.on_load(window.active_view())

    def on_encodings_detected(self, v, ok = True):
        """Update the status bar text of view *v*.

        The encoding Sublime Text assigned to the view is compared with the
        encoding detected by this package; when they differ the text warns
        that the document may be broken. *ok* is accepted but not used.
        """
        # Give ST time to "detect" the encoding or use its fallback encoding:
        # 'Undefined' means "still loading" or, once loaded, UTF-8.
        reported = v.encoding()
        if reported != 'Undefined':
            encoding_sublime = reported
        elif v.is_loading():
            encoding_sublime = 'Loading…'
        else:
            encoding_sublime = 'UTF-8'

        settings = v.settings()
        encoding_encohelp = settings.get('encoding_helper_encoding') or ''
        encoding_converted = settings.get('encoding_helper_converted') or ''
        # A binary file opened in hexadecimal mode is not a mismatch.
        if encoding_sublime == 'Hexadecimal' and encoding_encohelp == 'BINARY':
            encoding_encohelp = ''

        if encoding_encohelp == 'Detecting encoding…':
            status = 'Detecting encoding…'
        elif encoding_converted:
            status = "Converted to UTF-8 from " + encoding_normalize_for_display(encoding_converted)
        elif (
            encoding_sublime != 'Loading…'
            and encoding_encohelp not in ('', 'Unknown')
            and encoding_normalize_for_comparation(encoding_sublime)
            != encoding_normalize_for_comparation(encoding_encohelp)
        ):
            status = (
                'Opened as ' + encoding_normalize_for_display(encoding_sublime)
                + ', detected ' + encoding_normalize_for_display(encoding_encohelp)
                + ' (document maybe broken)'
            )
        else:
            # Nothing to warn about: show the encoding, but stay silent for
            # plain UTF-8. (The previous test, ``if not 'UTF-8'``, was always
            # false and therefore blanked the text for every encoding.)
            if encoding_sublime != 'Loading…':
                shown = encoding_sublime
            else:
                shown = encoding_encohelp
            status = encoding_normalize_for_display(shown)
            if status == 'UTF-8':
                status = ''
        v.set_status(self.STATUS_KEY, status)

    def on_load(self, v):
        """Refresh the status bar; Sublime may already know the encoding at load time."""
        # init_() forwards active_view() results unchecked, and that may be
        # None for a window without any view.
        if v and not v.settings().get('is_widget'):
            self.on_encodings_detected(v)

    def on_post_save_async(self, v):
        """Forget the cached guess after a save and detect again."""
        if not v.settings().get('is_widget'):
            v.settings().erase('encoding_helper_converted')
            v.settings().erase('encoding_helper_encoding')
            self.on_load_async(v)

    def on_activated_async(self, v):
        """Run detection (or reuse the cached guess) when a view gains focus."""
        if not v.settings().get('is_widget'):
            self.on_load_async(v)

    def on_load_async(self, v):
        """Guess the encoding of the file behind *v* and publish the result.

        The guess is cached in the 'encoding_helper_encoding' view setting. If
        the guessed encoding is listed in Pref.open_automatically_as_utf8 and
        the view has no unsaved changes, a ConvertToUTF8 thread is started.
        """
        if not v or v.settings().get('is_widget'):
            return

        # Has cached state?
        if v.settings().has('encoding_helper_encoding'):
            self.on_encodings_detected(v)
            return

        # If the file is not there, just give up.
        file_name = v.file_name()
        if not file_name or not os.path.isfile(file_name):
            v.settings().set('encoding_helper_encoding', '')
            self.on_encodings_detected(v)
            return

        v.set_status(self.STATUS_KEY, 'Detecting encoding…')
        encoding = self._guess_encoding(file_name)

        # The view is re-checked because detection may have taken a while.
        if not v:
            return
        if encoding == 'ASCII':
            encoding = 'UTF-8'
        v.settings().set('encoding_helper_encoding', encoding)
        self.on_encodings_detected(v)

        if (
            encoding != ''
            and encoding != 'UTF-8'
            and encoding in Pref.open_automatically_as_utf8
            and not v.is_dirty()
        ):
            # The message names the real direction of the conversion; it used
            # to read "Converting to <source encoding>…".
            v.set_status(
                self.STATUS_KEY,
                'Converting to UTF-8 from ' + encoding_normalize_for_display(encoding) + '…',
            )
            ConvertToUTF8(v, file_name, encoding).start()

    def _guess_encoding(self, file_name):
        """Return this package's guess for the encoding of *file_name*.

        The result is 'BINARY', 'Unknown' or an encoding name.
        """
        size = os.stat(file_name).st_size
        if BINARY.search(file_name) or maybe_binary(file_name):
            return 'BINARY'
        if size > self.MAX_DETECT_SIZE:  # skip files > 1Mb
            return 'Unknown'

        encoding = ''
        confidence = 0
        fallback_processed = False
        if size < self.SMALL_FILE_SIZE:
            # Small files are tried against the configured fallback list first.
            fallback_processed = True
            encoding = test_fallback_encodings(file_name) or ''
        if not encoding:
            encoding = test_fallback_encodings(file_name, ["UTF-8"]) or ''
        if not encoding:
            encoding, confidence = self._detect_with_chardet(file_name)

        undetected = encoding in (None, 'NONE', '', 'Unknown')
        if (undetected or confidence < self.MIN_CONFIDENCE) and not fallback_processed:
            # Keep the current guess when no fallback encoding fits either.
            encoding = test_fallback_encodings(file_name) or encoding
        return encoding

    def _detect_with_chardet(self, file_name):
        """Feed the whole file to UniversalDetector; return (encoding, confidence)."""
        detector = UniversalDetector()
        # ``with`` closes the handle even if reading or feeding fails.
        with open(file_name, 'rb') as fp:
            detector.feed(fp.read())
        detector.close()
        if detector.done:
            return str(detector.result['encoding']).upper(), detector.result['confidence']
        return 'Unknown', 1
class Toutf8fromBestGuessCommand(sublime_plugin.WindowCommand):
    """Window command: convert the active view to UTF-8 from the guessed encoding."""

    # Values of the 'encoding_helper_encoding' view setting that do not name
    # an encoding a conversion could start from.
    _NOT_CONVERTIBLE = (None, 'UTF-8', 'BINARY', 'Unknown', '', 'Detecting encoding…')

    def _guessed_encoding(self):
        """Return the convertible guess stored on the active view, else None."""
        view = sublime.active_window().active_view()
        encoding = view.settings().get('encoding_helper_encoding')
        if encoding in self._NOT_CONVERTIBLE:
            return None
        return encoding

    def run(self):
        """Delegate to Toutf8fromCommand with the guessed encoding, if any."""
        encoding = self._guessed_encoding()
        if encoding is not None:
            # The delegate is built for this command's own window; the
            # WindowCommand *class* used to be passed in its place.
            Toutf8fromCommand(self.window).run(encoding)

    def description(self):
        """Return the menu caption, naming the guessed encoding when there is one."""
        try:
            encoding = self._guessed_encoding()
            if encoding is not None:
                return 'Convert to UTF-8 From ' + encoding
        except Exception:
            # Best effort (e.g. no active view): use the generic caption.
            pass
        return 'Convert to UTF-8 From Best Guess'

    def is_enabled(self):
        """Return True only when the active view has a convertible guess."""
        try:
            return self._guessed_encoding() is not None
        except Exception:
            # Best effort (e.g. no active view): keep the command disabled.
            return False
class Toutf8fromCommand(sublime_plugin.WindowCommand):
    """Window command: re-read the active view's file as *encoding* into UTF-8."""

    # Values that do not name an encoding a conversion could start from.
    _NOT_CONVERTIBLE = (None, 'UTF-8', 'BINARY', 'Unknown', '', 'Detecting encoding…')

    @staticmethod
    def _is_existing_file(file_name):
        """Return True when *file_name* is a non-empty path to an existing file."""
        return bool(file_name) and os.path.isfile(file_name)

    def run(self, encoding = ''):
        """Start a ConvertToUTF8 thread for the active view.

        Returns True when the conversion was started and False when *encoding*
        is not convertible, the view has no file on disk, or anything failed.
        """
        try:
            if encoding in self._NOT_CONVERTIBLE:
                return False
            v = sublime.active_window().active_view()
            file_name = v.file_name()
            if not self._is_existing_file(file_name):
                return False
            ConvertToUTF8(v, file_name, encoding).start()
            return True
        except Exception:
            # Best effort, as before, but without swallowing
            # KeyboardInterrupt/SystemExit the way a bare ``except:`` did.
            return False

    def is_enabled(self, encoding = ''):
        """Return True when the active view is backed by an existing file."""
        try:
            file_name = sublime.active_window().active_view().file_name()
            return self._is_existing_file(file_name)
        except Exception:
            # E.g. no active view: keep the command disabled.
            return False
class ConvertToUTF8(threading.Thread):
    """Thread that re-reads a file using a given encoding.

    On success the decoded text is passed to *callback* (``on_done`` unless
    another callable is supplied); failures go to ``on_lookup_error`` or
    ``on_error``. All three are scheduled with ``sublime.set_timeout(..., 0)``
    instead of being called from this thread.
    """

    def __init__(self, v, file_name, encoding, callback = False):
        """Remember the view *v*, the file to read and the source *encoding*."""
        threading.Thread.__init__(self)
        self.file_name = file_name
        self.encoding = encoding
        self.v = v
        # ``False`` is the "no callback given" sentinel.
        self.callback = self.on_done if callback is False else callback

    def run(self):
        """Decode the file strictly and schedule the matching handler."""
        requested = self.encoding.lower()
        try:
            codec_name = codecs.lookup(requested).name
        except Exception:
            # Any problem with the name surfaces again in open() below, where
            # it is reported to the user.
            codec_name = requested

        try:
            # ``with`` closes the handle; it used to be left to the garbage
            # collector, also when decoding failed.
            with open(self.file_name, "r", encoding=codec_name, errors='strict', newline='') as fp:
                content = fp.read()
        except LookupError:
            sublime.set_timeout(lambda: self.on_lookup_error(self.file_name, self.encoding), 0)
        except Exception:
            sublime.set_timeout(lambda: self.on_error(self.file_name, self.encoding), 0)
        else:
            # Empty content is not handed to the callback.
            if len(content) != 0:
                sublime.set_timeout(lambda: self.callback(content, self.encoding), 0)

    def on_done(self, content, encoding):
        """Default callback: put *content* into the view and mark it converted."""
        if self.v:
            write_to_view(self.v, content)
            self.v.settings().set('encoding_helper_converted', encoding)
            self.v.settings().set('encoding_helper_encoding', 'UTF-8')
            self.v.set_encoding('UTF-8')
            EncodingOnStatusBarListener().on_encodings_detected(self.v)

    def _restore_status(self, encoding):
        """Store *encoding* as the view's guess again and refresh the status bar."""
        # Same guard as on_done, so the view is only touched when it is truthy.
        if self.v:
            self.v.settings().set('encoding_helper_encoding', encoding)
            EncodingOnStatusBarListener().on_encodings_detected(self.v)

    def on_error(self, file_name, encoding):
        """Report that *file_name* could not be decoded as *encoding*."""
        self._restore_status(encoding)
        sublime.error_message('Unable to convert to UTF-8 from encoding "'+encoding+'" the file: \n'+file_name)

    def on_lookup_error(self, file_name, encoding):
        """Report that *encoding* is not a codec available on this system."""
        self._restore_status(encoding)
        sublime.error_message('The encoding "'+encoding+'" is unknown in this system.\n Unable to convert to UTF-8 the file: \n'+file_name)
def maybe_binary(file_name):
    """Return True when a NUL byte is found near the start of *file_name*.

    The file is read as bytes: 500 bytes first, then chunks of 8000, stopping
    at end of file or once the running byte budget exceeds 1048576.
    """
    budget_used = 500
    # ``with`` closes the handle on every exit path, including read errors.
    with open(file_name, 'rb') as fp:
        chunk = fp.read(500)
        # read() returns b'' at end of file. The old test ``line != ''``
        # compared bytes with str, was always true, and kept reading past the
        # end of small files until the budget below ran out.
        while chunk:
            if b'\x00' in chunk:
                return True
            budget_used += 8000
            if budget_used > 1048576:
                return False
            chunk = fp.read(8000)
    return False
def test_fallback_encodings(file_name, encodings = False):
    """Return the first encoding that decodes *file_name* strictly, else False.

    *encodings* is a sequence of encoding names tried in order; when left at
    its default, ``Pref.fallback_encodings`` is used. The name is returned
    exactly as it appears in the sequence.
    """
    if encodings is False:
        encodings = Pref.fallback_encodings
    for encoding in encodings:
        try:
            # ``with`` closes the handle whether or not decoding succeeds.
            with codecs.open(file_name, "rb", encoding.lower(), errors='strict') as fp:
                fp.read()
        except (LookupError, UnicodeError):
            # LookupError: the name is not a codec known to Python (e.g. a
            # typo in the settings). UnicodeError: the content does not decode;
            # this also covers the plain UnicodeError some stream readers
            # raise (UTF-16 without a BOM), not only UnicodeDecodeError.
            continue
        return encoding
    return False
def write_to_view(view, content):
    """Run the 'encoding_helper_write_to_view' command on *view* with *content*."""
    arguments = {"content": content}
    view.run_command('encoding_helper_write_to_view', arguments)
class EncodingHelperWriteToViewCommand(sublime_plugin.TextCommand):
    """Text command that swaps the entire buffer for the given content."""

    def run(self, edit, content):
        """Replace everything in the view with *content* and reset the selection."""
        target = self.view
        whole_buffer = sublime.Region(0, target.size())
        target.replace(edit, whole_buffer, content)

        # Leave a single selection region at offset 0.
        selection = target.sel()
        selection.clear()
        selection.add(sublime.Region(0))

        # NOTE(review): the edit is ended explicitly here, as in the original;
        # confirm this call is still needed on the targeted Sublime Text version.
        target.end_edit(edit)
def encoding_normalize_for_display(encoding):
    """Return *encoding* in the spelling used for the status bar.

    For names containing '(' (e.g. 'Western (ISO 8859-1)') only the text after
    the first '(' is used. Names other than 'Hexadecimal', 'BINARY' and those
    starting with 'UTF' are replaced by Python's canonical codec name when
    ``codecs.lookup`` knows them. The result is upper-cased with '-' as the
    separator, except for the fixed spellings 'Hexadecimal', ' with BOM',
    ' LE' and ' BE'.
    """
    if '(' in encoding:
        encoding = encoding.split('(')[1]
    if encoding != 'Hexadecimal' and encoding != 'BINARY' and encoding[:3] != 'UTF':
        try:
            encoding = codecs.lookup(encoding).name
        except (LookupError, ValueError):
            # Not a codec Python knows: display the name as it was given.
            pass

    text = encoding.lower().strip().replace(')', '')
    text = text.replace(' ', '-').replace('_', '-').strip().upper()
    text = text.replace('HEXADECIMAL', 'Hexadecimal')
    text = text.replace('-WITH-BOM', ' with BOM').replace('-LE', ' LE').replace('-BE', ' BE')
    # Canonical codec names come back as 'ISO8859-1' / 'CP1252'; insert the
    # dash only when a digit follows directly. An unconditional replace turned
    # names that already had the dash into 'ISO--…' / 'CP--…'.
    return re.sub(r'(ISO|CP)(?=\d)', r'\1-', text)
def encoding_normalize_for_comparation(encoding):
    """Return a canonical key for comparing two encoding names.

    For names containing '(' only the text after the first '(' is used, and
    names other than 'Hexadecimal', 'BINARY' and those starting with 'UTF' are
    replaced by Python's canonical codec name when ``codecs.lookup`` knows
    them. The key is upper-cased and has the BOM / byte-order markers and all
    separators removed, so 'UTF-8 with BOM' and 'UTF-8' both give 'UTF8'.
    """
    if '(' in encoding:
        encoding = encoding.split('(')[1]
    if encoding != 'Hexadecimal' and encoding != 'BINARY' and encoding[:3] != 'UTF':
        try:
            encoding = codecs.lookup(encoding).name
        except (LookupError, ValueError):
            # Not a codec Python knows: compare the name as it was given.
            pass

    key = encoding.strip().upper().replace(' ', '-').replace('_', '-')
    # '-SIG' is the suffix of Python's 'utf-8-sig' codec (UTF-8 with a BOM); it
    # is dropped like '-WITH-BOM' so both spellings compare equal to UTF-8.
    for marker in ('-WITH-BOM', '-SIG', '-LE', '-BE'):
        key = key.replace(marker, '')
    return key.replace(')', '').replace('-', '').strip()