doublecmd/src/uconvencoding.pas
2016-06-12 18:58:37 +00:00

485 lines
13 KiB
ObjectPascal
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
Double Commander
-------------------------------------------------------------------------
Encoding conversion and related stuff
Copyright (C) 2011-2016 Alexander Koblov (alexx2000@mail.ru)
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
}
unit uConvEncoding;
{$mode delphi}
interface
type
TMacroEncoding = (meOEM, meANSI, meUTF8, meUTF8BOM, meUTF16LE, meUTF16BE);
function TextIsASCII(const S: String): Boolean;
function DetectEncoding(const S: String): String; overload;
function SingleByteEncoding(TextEncoding: String): Boolean;
function DetectEncoding(const S: String; ADefault: TMacroEncoding; AStrict: Boolean): TMacroEncoding; overload;
implementation
uses
SysUtils, Classes, LazUTF8, LConvEncoding, GetText,
nsCore, nsUniversalDetector;
var
Lang, FallbackLang: AnsiString;
SupportedEncodings: TStringList = nil;
type
TMyCodePages = (cp1251, cpKOI8R, cp866);
const
scCodePage : array[TMyCodePages] of AnsiString =
(
// CP1251 (WINDOWS)
#$C0#$E0 + // Аа
#$C1#$E1 + // Бб
#$C2#$E2 + // Вв
#$C3#$E3 + // Гг
#$C4#$E4 + // Дд
#$C5#$E5 + // Ее
#$A8#$B8 + // Ёё
#$C6#$E6 + // Жж
#$C7#$E7 + // Зз
#$C8#$E8 + // Ии
#$C9#$E9 + // Йй
#$CA#$EA + // Кк
#$CB#$EB + // Лл
#$CC#$EC + // Мм
#$CD#$ED + // Нн
#$CE#$EE + // Оо
#$CF#$EF + // Пп
#$D0#$F0 + // Рр
#$D1#$F1 + // Сс
#$D2#$F2 + // Тт
#$D3#$F3 + // Уу
#$D4#$F4 + // Фф
#$D5#$F5 + // Хх
#$D6#$F6 + // Цц
#$D7#$F7 + // Чч
#$D8#$F8 + // Шш
#$D9#$F9 + // Щщ
#$DA#$FA + // Ъъ
#$DB#$FB + // Ыы
#$DC#$FC + // Ьь
#$DD#$FD + // Ээ
#$DE#$FE + // Юю
#$DF#$FF , // Яя
// KOI8-R (UNIX)
#$E1#$C1 + // Аа
#$E2#$C2 + // Бб
#$F7#$D7 + // Вв
#$E7#$C7 + // Гг
#$E4#$C4 + // Дд
#$E5#$C5 + // Ее
#$B3#$A3 + // Ёё
#$F6#$D6 + // Жж
#$FA#$DA + // Зз
#$E9#$C9 + // Ии
#$EA#$CA + // Йй
#$EB#$CB + // Кк
#$EC#$CC + // Лл
#$ED#$CD + // Мм
#$EE#$CE + // Нн
#$EF#$CF + // Оо
#$F0#$D0 + // Пп
#$F2#$D2 + // Рр
#$F3#$D3 + // Сс
#$F4#$D4 + // Тт
#$F5#$D5 + // Уу
#$E6#$C6 + // Фф
#$E8#$C8 + // Хх
#$E3#$C3 + // Цц
#$FE#$DE + // Чч
#$FB#$DB + // Шш
#$FD#$DD + // Щщ
#$FF#$DF + // Ъъ
#$F9#$D9 + // Ыы
#$F8#$D8 + // Ьь
#$FC#$DC + // Ээ
#$E0#$C0 + // Юю
#$F1#$D1 , // Яя
// CP866 (DOS)
#$80#$A0 + // Аа
#$81#$A1 + // Бб
#$82#$A2 + // Вв
#$83#$A3 + // Гг
#$84#$A4 + // Дд
#$85#$A5 + // Ее
#$F0#$F1 + // Ёё
#$86#$A6 + // Жж
#$87#$A7 + // Зз
#$88#$A8 + // Ии
#$89#$A9 + // Йй
#$8A#$AA + // Кк
#$8B#$AB + // Лл
#$8C#$AC + // Мм
#$8D#$AD + // Нн
#$8E#$AE + // Оо
#$8F#$AF + // Пп
#$90#$E0 + // Рр
#$91#$E1 + // Сс
#$92#$E2 + // Тт
#$93#$E3 + // Уу
#$94#$E4 + // Фф
#$95#$E5 + // Хх
#$96#$E6 + // Цц
#$97#$E7 + // Чч
#$98#$E8 + // Шш
#$99#$E9 + // Щщ
#$9A#$EA + // Ъъ
#$9B#$EB + // Ыы
#$9C#$EC + // Ьь
#$9D#$ED + // Ээ
#$9E#$EE + // Юю
#$9F#$EF // Яя
);
var
svStatistic : array[AnsiChar] of Single;
procedure InitStatistic;
begin
FillChar(svStatistic, SizeOf(svStatistic), 0);
// CP1251 (WINDOWS)
svStatistic[#$C0] := 0.001; // 'А'
svStatistic[#$C1] := 0; // 'Б'
svStatistic[#$C2] := 0.002; // 'В'
svStatistic[#$C3] := 0; // 'Г'
svStatistic[#$C4] := 0.001; // 'Д'
svStatistic[#$C5] := 0.001; // 'Е'
svStatistic[#$C6] := 0; // 'Ж'
svStatistic[#$C7] := 0; // 'З'
svStatistic[#$C8] := 0.001; // 'И'
svStatistic[#$C9] := 0; // 'Й'
svStatistic[#$CA] := 0.001; // 'К'
svStatistic[#$CB] := 0; // 'Л'
svStatistic[#$CC] := 0.001; // 'М'
svStatistic[#$CD] := 0.001; // 'Н'
svStatistic[#$CE] := 0.001; // 'О'
svStatistic[#$CF] := 0.002; // 'П'
svStatistic[#$D0] := 0.002; // 'Р'
svStatistic[#$D1] := 0.001; // 'С'
svStatistic[#$D2] := 0.001; // 'Т'
svStatistic[#$D3] := 0; // 'У'
svStatistic[#$D4] := 0; // 'Ф'
svStatistic[#$D5] := 0; // 'Х'
svStatistic[#$D6] := 0; // 'Ц'
svStatistic[#$D7] := 0.001; // 'Ч'
svStatistic[#$D8] := 0.001; // 'Ш'
svStatistic[#$D9] := 0; // 'Щ'
svStatistic[#$DA] := 0; // 'Ъ'
svStatistic[#$DB] := 0; // 'Ы'
svStatistic[#$DC] := 0; // 'Ь'
svStatistic[#$DD] := 0.001; // 'Э'
svStatistic[#$DE] := 0; // 'Ю'
svStatistic[#$DF] := 0; // 'Я'
svStatistic[#$E0] := 0.057; // 'а'
svStatistic[#$E1] := 0.01; // 'б'
svStatistic[#$E2] := 0.031; // 'в'
svStatistic[#$E3] := 0.011; // 'г'
svStatistic[#$E4] := 0.021; // 'д'
svStatistic[#$E5] := 0.067; // 'е'
svStatistic[#$E6] := 0.007; // 'ж'
svStatistic[#$E7] := 0.013; // 'з'
svStatistic[#$E8] := 0.052; // 'и'
svStatistic[#$E9] := 0.011; // 'й'
svStatistic[#$EA] := 0.023; // 'к'
svStatistic[#$EB] := 0.03; // 'л'
svStatistic[#$EC] := 0.024; // 'м'
svStatistic[#$ED] := 0.043; // 'н'
svStatistic[#$EE] := 0.075; // 'о'
svStatistic[#$EF] := 0.026; // 'п'
svStatistic[#$F0] := 0.038; // 'р'
svStatistic[#$F1] := 0.034; // 'с'
svStatistic[#$F2] := 0.046; // 'т'
svStatistic[#$F3] := 0.016; // 'у'
svStatistic[#$F4] := 0.001; // 'ф'
svStatistic[#$F5] := 0.006; // 'х'
svStatistic[#$F6] := 0.002; // 'ц'
svStatistic[#$F7] := 0.011; // 'ч'
svStatistic[#$F8] := 0.004; // 'ш'
svStatistic[#$F9] := 0.004; // 'щ'
svStatistic[#$FA] := 0; // 'ъ'
svStatistic[#$FB] := 0.012; // 'ы'
svStatistic[#$FC] := 0.012; // 'ь'
svStatistic[#$FD] := 0.003; // 'э'
svStatistic[#$FE] := 0.005; // 'ю'
svStatistic[#$FF] := 0.015; // 'я'
end;
function MyConvertString(const S: AnsiString; const FromCP, ToCP: TMyCodePages): AnsiString;
var
I: Integer;
C: AnsiChar;
Chars: array [AnsiChar] of AnsiChar;
begin
Result:= S;
if FromCP = ToCP then Exit;
for C := #0 to #255 do
Chars[C] := C;
for I := 1 to Length(scCodePage[cp1251]) do
Chars[scCodePage[FromCP][I]] := scCodePage[ToCP][I];
for I := 1 to Length(s) do
Result[I] := Chars[Result[I]];
end;
function DetectCharsetCyrillic(const S: AnsiString): AnsiString;
var
I: Integer;
J: LongWord;
C: AnsiChar;
D, M: Single;
T: AnsiString;
CodePage: TMyCodePages;
CharCount: array [AnsiChar] of Integer;
begin
J := 0;
M := 0;
T := S;
FillChar(CharCount, SizeOf(CharCount), 0);
for I := 1 to Length(S) do
Inc(CharCount[S[I]]);
// Check for CP866 encoding
for C := #$80 {'А'} to #$AF {'п'} do
Inc(J, CharCount[C]);
if J > (Length(S) div 3) then
begin
Result := 'CP866';
Exit;
end;
for C := #$C0 {'А'} to #$FF {'я'} do
M := M + sqr(CharCount[C] / Length(S) - svStatistic[C]);
for CodePage := Low(TMyCodePages) to High(TMyCodePages) do
begin
// Convert to cp1251, because statistic in this encoding
T:= MyConvertString(S, CodePage, cp1251);
FillChar(CharCount, SizeOf(CharCount), 0);
for I := 1 to Length(T) do
Inc(CharCount[T[I]]);
D := 0;
for C := #$C0 {'А'} to #$FF {'я'} do
D := D + sqr(CharCount[C] / Length(S) - svStatistic[C]);
if D <= M then
begin
M := D;
case CodePage of
cp1251 : Result:= 'CP1251';
cpKOI8R: Result:= 'KOI-8';
cp866 : Result:= 'CP866';
end;
end;
end;
end;
function MyDetectCodePageType(const S: AnsiString): AnsiString;
var
Detector: TnsUniversalDetector = nil;
CharsetInfo: rCharsetInfo;
begin
Detector:= TnsUniversalDetector.Create;
try
Detector.Reset;
Detector.HandleData(PChar(S), Length(S));
if not Detector.Done then Detector.DataEnd;
CharsetInfo:= Detector.GetDetectedCharsetInfo;
case CharsetInfo.CodePage of
866: Result:= 'CP866';
932: Result:= 'CP932';
950: Result:= 'CP950';
1251: Result:= 'CP1251';
1252: Result:= 'CP1252';
1253: Result:= 'CP1253';
1255: Result:= 'CP1255';
20866: Result:= 'KOI-8';
else
begin
Result:= CharsetInfo.Name;
// When unknown encoding then use system encoding
if SupportedEncodings.IndexOf(Result) < 0 then
begin
if (FallbackLang = 'be') or (FallbackLang = 'bg') or
(FallbackLang = 'ky') or (FallbackLang = 'mk') or
(FallbackLang = 'mn') or (FallbackLang = 'ru') or
(FallbackLang = 'tt') then
Result:= DetectCharsetCyrillic(S)
else
begin
Result:= GetDefaultTextEncoding;
if NormalizeEncoding(Result) = EncodingUTF8 then begin
// the system encoding is UTF-8, but it is not UTF-8
// use ISO-8859-1 instead. This encoding has a full 1:1 mapping to unicode,
// so no character is lost during conversions.
Result:= 'ISO-8859-1';
end;
end;
end;
end;
end;
finally
FreeAndNil(Detector);
end;
end;
function DetectEncoding(const S: String): String;
function CompareI(p1, p2: PChar; Count: integer): boolean;
var
i: Integer;
Chr1: Byte;
Chr2: Byte;
begin
for i:=1 to Count do begin
Chr1 := byte(p1^);
Chr2 := byte(p2^);
if Chr1<>Chr2 then begin
if Chr1 in [97..122] then
dec(Chr1,32);
if Chr2 in [97..122] then
dec(Chr2,32);
if Chr1<>Chr2 then exit(false);
end;
inc(p1);
inc(p2);
end;
Result:=true;
end;
var
L, P: Integer;
EndPos: Integer;
begin
L:= Length(S);
if L = 0 then begin
Result:= GetDefaultTextEncoding;
Exit;
end;
// Try detect Unicode
case DetectEncoding(S, meOEM, False) of
meUTF8: Exit(EncodingUTF8);
meUTF8BOM: Exit(EncodingUTF8BOM);
meUTF16LE: Exit(EncodingUCS2LE);
meUTF16BE: Exit(EncodingUCS2BE);
end;
// Try {%encoding eee}
if (L >= 11) and CompareI(@S[1], '{%encoding ', 11) then
begin
P:= 12;
while (P <= L) and (S[P] in [' ', #9]) do Inc(P);
EndPos:= P;
while (EndPos <= L) and (not (S[EndPos] in ['}', ' ', #9])) do Inc(EndPos);
Result:= NormalizeEncoding(Copy(S, P, EndPos - P));
Exit;
end;
// Try to detect encoding
Result:= MyDetectCodePageType(S);
end;
function SingleByteEncoding(TextEncoding: String): Boolean;
begin
TextEncoding := NormalizeEncoding(TextEncoding);
Result := (TextEncoding <> EncodingUTF8) and (TextEncoding <> EncodingUTF8BOM) and
(TextEncoding <> EncodingUCS2LE) and (TextEncoding <> EncodingUCS2BE);
end;
function DetectEncoding(const S: String; ADefault: TMacroEncoding;
AStrict: Boolean): TMacroEncoding;
var
L, P, I: Integer;
begin
L:= Length(S);
if L = 0 then Exit(ADefault);
// Try UTF-8 BOM (Byte Order Mark)
if (L >= 3) and (S[1] = #$EF) and (S[2] = #$BB ) and (S[3] = #$BF) then
begin
Result:= meUTF8BOM;
Exit;
end;
// Try ucs-2le BOM FF FE
if (L >= 2) and (S[1] = #$FF) and (S[2] = #$FE) then
begin
Result:= meUTF16LE;
Exit;
end;
// Try ucs-2be BOM FE FF
if (L >= 2) and (S[1] = #$FE) and (S[2] = #$FF) then
begin
Result:= meUTF16BE;
Exit;
end;
// Try UTF-8 (this includes ASCII)
P:= 1;
I:= Ord(not AStrict);
while (P <= L) do
begin
if Ord(S[P]) < 128 then
begin
// ASCII
Inc(P);
end
else begin
I:= UTF8CharacterStrictLength(@S[P]);
if I = 0 then Exit(ADefault);
Inc(P, I);
end;
end;
if I <> 0 then
Result:= meUTF8
else begin
Result:= ADefault;
end;
end;
function TextIsASCII(const S: String): Boolean; inline;
var
I: Integer;
begin
for I:= 1 to Length(S) do
begin
if Ord(S[I]) > 127 then
Exit(False);
end;
Result:= True;
end;
initialization
InitStatistic;
GetLanguageIDs(Lang, FallbackLang);
SupportedEncodings:= TStringList.Create;
GetSupportedEncodings(SupportedEncodings);
finalization
FreeAndNil(SupportedEncodings);
end.