doublecmd/src/uconvencoding.pas

{
   Double Commander
   -------------------------------------------------------------------------
   Encoding conversion and related stuff

   Copyright (C) 2011-2016 Alexander Koblov (alexx2000@mail.ru)

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License along
   with this program; if not, write to the Free Software Foundation, Inc.,
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
}

unit uConvEncoding;

{$mode delphi}

interface

type
  TMacroEncoding = (meOEM, meANSI, meUTF8, meUTF8BOM, meUTF16LE, meUTF16BE);

function TextIsASCII(const S: String): Boolean;
function DetectEncoding(const S: String): String; overload;
function SingleByteEncoding(TextEncoding: String): Boolean;
function DetectEncoding(const S: String; ADefault: TMacroEncoding; AStrict: Boolean): TMacroEncoding; overload;

implementation

uses
  SysUtils, Classes, LazUTF8, LConvEncoding, GetText,
  nsCore, nsUniversalDetector;

var
  Lang, FallbackLang: AnsiString;
  SupportedEncodings: TStringList = nil;

type
  TMyCodePages = (cp1251, cpKOI8R, cp866);

const
  scCodePage  : array[TMyCodePages] of AnsiString =
    (
     // CP1251 (WINDOWS)
     #$C0#$E0 + // Аа
     #$C1#$E1 + // Бб
     #$C2#$E2 + // Вв
     #$C3#$E3 + // Гг
     #$C4#$E4 + // Дд
     #$C5#$E5 + // Ее
     #$A8#$B8 + // Ёё
     #$C6#$E6 + // Жж
     #$C7#$E7 + // Зз
     #$C8#$E8 + // Ии
     #$C9#$E9 + // Йй
     #$CA#$EA + // Кк
     #$CB#$EB + // Лл
     #$CC#$EC + // Мм
     #$CD#$ED + // Нн
     #$CE#$EE + // Оо
     #$CF#$EF + // Пп
     #$D0#$F0 + // Рр
     #$D1#$F1 + // Сс
     #$D2#$F2 + // Тт
     #$D3#$F3 + // Уу
     #$D4#$F4 + // Фф
     #$D5#$F5 + // Хх
     #$D6#$F6 + // Цц
     #$D7#$F7 + // Чч
     #$D8#$F8 + // Шш
     #$D9#$F9 + // Щщ
     #$DA#$FA + // Ъъ
     #$DB#$FB + // Ыы
     #$DC#$FC + // Ьь
     #$DD#$FD + // Ээ
     #$DE#$FE + // Юю
     #$DF#$FF , // Яя
     // KOI8-R (UNIX)
     #$E1#$C1 + // Аа
     #$E2#$C2 + // Бб
     #$F7#$D7 + // Вв
     #$E7#$C7 + // Гг
     #$E4#$C4 + // Дд
     #$E5#$C5 + // Ее
     #$B3#$A3 + // Ёё
     #$F6#$D6 + // Жж
     #$FA#$DA + // Зз
     #$E9#$C9 + // Ии
     #$EA#$CA + // Йй
     #$EB#$CB + // Кк
     #$EC#$CC + // Лл
     #$ED#$CD + // Мм
     #$EE#$CE + // Нн
     #$EF#$CF + // Оо
     #$F0#$D0 + // Пп
     #$F2#$D2 + // Рр
     #$F3#$D3 + // Сс
     #$F4#$D4 + // Тт
     #$F5#$D5 + // Уу
     #$E6#$C6 + // Фф
     #$E8#$C8 + // Хх
     #$E3#$C3 + // Цц
     #$FE#$DE + // Чч
     #$FB#$DB + // Шш
     #$FD#$DD + // Щщ
     #$FF#$DF + // Ъъ
     #$F9#$D9 + // Ыы
     #$F8#$D8 + // Ьь
     #$FC#$DC + // Ээ
     #$E0#$C0 + // Юю
     #$F1#$D1 , // Яя
     // CP866 (DOS)
     #$80#$A0 + // Аа
     #$81#$A1 + // Бб
     #$82#$A2 + // Вв
     #$83#$A3 + // Гг
     #$84#$A4 + // Дд
     #$85#$A5 + // Ее
     #$F0#$F1 + // Ёё
     #$86#$A6 + // Жж
     #$87#$A7 + // Зз
     #$88#$A8 + // Ии
     #$89#$A9 + // Йй
     #$8A#$AA + // Кк
     #$8B#$AB + // Лл
     #$8C#$AC + // Мм
     #$8D#$AD + // Нн
     #$8E#$AE + // Оо
     #$8F#$AF + // Пп
     #$90#$E0 + // Рр
     #$91#$E1 + // Сс
     #$92#$E2 + // Тт
     #$93#$E3 + // Уу
     #$94#$E4 + // Фф
     #$95#$E5 + // Хх
     #$96#$E6 + // Цц
     #$97#$E7 + // Чч
     #$98#$E8 + // Шш
     #$99#$E9 + // Щщ
     #$9A#$EA + // Ъъ
     #$9B#$EB + // Ыы
     #$9C#$EC + // Ьь
     #$9D#$ED + // Ээ
     #$9E#$EE + // Юю
     #$9F#$EF   // Яя
    );

var
  svStatistic : array[AnsiChar] of Single;

procedure InitStatistic;
begin
  FillChar(svStatistic, SizeOf(svStatistic), 0);
  // CP1251 (WINDOWS)
  svStatistic[#$C0] := 0.001; // 'А'
  svStatistic[#$C1] := 0;     // 'Б'
  svStatistic[#$C2] := 0.002; // 'В'
  svStatistic[#$C3] := 0;     // 'Г'
  svStatistic[#$C4] := 0.001; // 'Д'
  svStatistic[#$C5] := 0.001; // 'Е'
  svStatistic[#$C6] := 0;     // 'Ж'
  svStatistic[#$C7] := 0;     // 'З'
  svStatistic[#$C8] := 0.001; // 'И'
  svStatistic[#$C9] := 0;     // 'Й'
  svStatistic[#$CA] := 0.001; // 'К'
  svStatistic[#$CB] := 0;     // 'Л'
  svStatistic[#$CC] := 0.001; // 'М'
  svStatistic[#$CD] := 0.001; // 'Н'
  svStatistic[#$CE] := 0.001; // 'О'
  svStatistic[#$CF] := 0.002; // 'П'
  svStatistic[#$D0] := 0.002; // 'Р'
  svStatistic[#$D1] := 0.001; // 'С'
  svStatistic[#$D2] := 0.001; // 'Т'
  svStatistic[#$D3] := 0;     // 'У'
  svStatistic[#$D4] := 0;     // 'Ф'
  svStatistic[#$D5] := 0;     // 'Х'
  svStatistic[#$D6] := 0;     // 'Ц'
  svStatistic[#$D7] := 0.001; // 'Ч'
  svStatistic[#$D8] := 0.001; // 'Ш'
  svStatistic[#$D9] := 0;     // 'Щ'
  svStatistic[#$DA] := 0;     // 'Ъ'
  svStatistic[#$DB] := 0;     // 'Ы'
  svStatistic[#$DC] := 0;     // 'Ь'
  svStatistic[#$DD] := 0.001; // 'Э'
  svStatistic[#$DE] := 0;     // 'Ю'
  svStatistic[#$DF] := 0;     // 'Я'
  svStatistic[#$E0] := 0.057; // 'а'
  svStatistic[#$E1] := 0.01;  // 'б'
  svStatistic[#$E2] := 0.031; // 'в'
  svStatistic[#$E3] := 0.011; // 'г'
  svStatistic[#$E4] := 0.021; // 'д'
  svStatistic[#$E5] := 0.067; // 'е'
  svStatistic[#$E6] := 0.007; // 'ж'
  svStatistic[#$E7] := 0.013; // 'з'
  svStatistic[#$E8] := 0.052; // 'и'
  svStatistic[#$E9] := 0.011; // 'й'
  svStatistic[#$EA] := 0.023; // 'к'
  svStatistic[#$EB] := 0.03;  // 'л'
  svStatistic[#$EC] := 0.024; // 'м'
  svStatistic[#$ED] := 0.043; // 'н'
  svStatistic[#$EE] := 0.075; // 'о'
  svStatistic[#$EF] := 0.026; // 'п'
  svStatistic[#$F0] := 0.038; // 'р'
  svStatistic[#$F1] := 0.034; // 'с'
  svStatistic[#$F2] := 0.046; // 'т'
  svStatistic[#$F3] := 0.016; // 'у'
  svStatistic[#$F4] := 0.001; // 'ф'
  svStatistic[#$F5] := 0.006; // 'х'
  svStatistic[#$F6] := 0.002; // 'ц'
  svStatistic[#$F7] := 0.011; // 'ч'
  svStatistic[#$F8] := 0.004; // 'ш'
  svStatistic[#$F9] := 0.004; // 'щ'
  svStatistic[#$FA] := 0;     // 'ъ'
  svStatistic[#$FB] := 0.012; // 'ы'
  svStatistic[#$FC] := 0.012; // 'ь'
  svStatistic[#$FD] := 0.003; // 'э'
  svStatistic[#$FE] := 0.005; // 'ю'
  svStatistic[#$FF] := 0.015; // 'я'
end;

function MyConvertString(const S: AnsiString; const FromCP, ToCP: TMyCodePages): AnsiString;
var
  I: Integer;
  C: AnsiChar;
  Chars: array [AnsiChar] of AnsiChar;
begin
  Result:= S;

  if FromCP = ToCP then Exit;

  for C := #0 to #255 do
    Chars[C] := C;

  for I := 1 to Length(scCodePage[cp1251]) do
    Chars[scCodePage[FromCP][I]] := scCodePage[ToCP][I];

  for I := 1 to Length(s) do
    Result[I] := Chars[Result[I]];
end;

function DetectCharsetCyrillic(const S: AnsiString): AnsiString;
var
  I: Integer;
  J: LongWord;
  C: AnsiChar;
  D, M: Single;
  T: AnsiString;
  CodePage: TMyCodePages;
  CharCount: array [AnsiChar] of Integer;
begin
  J := 0;
  M := 0;
  T := S;
  FillChar(CharCount, SizeOf(CharCount), 0);
  for I := 1 to Length(S) do
    Inc(CharCount[S[I]]);
  // Check for CP866 encoding
  for C := #$80 {'А'} to #$AF {'п'} do
    Inc(J, CharCount[C]);
  if J > (Length(S) div 3) then
  begin
    Result := 'CP866';
    Exit;
  end;
  for C := #$C0 {'А'} to #$FF {'я'} do
    M := M + sqr(CharCount[C] / Length(S) - svStatistic[C]);
  for CodePage := Low(TMyCodePages) to High(TMyCodePages) do
  begin
    // Convert to cp1251, because statistic in this encoding
    T:= MyConvertString(S, CodePage, cp1251);

    FillChar(CharCount, SizeOf(CharCount), 0);
    for I := 1 to Length(T) do
      Inc(CharCount[T[I]]);
    D := 0;
    for C := #$C0 {'А'} to #$FF {'я'} do
      D := D + sqr(CharCount[C] / Length(S) - svStatistic[C]);
    if D <= M then
    begin
      M := D;
      case CodePage of
        cp1251 : Result:= 'CP1251';
        cpKOI8R: Result:= 'KOI-8';
        cp866  : Result:= 'CP866';
      end;
    end;
  end;
end;

function MyDetectCodePageType(const S: AnsiString): AnsiString;
var
  Detector: TnsUniversalDetector = nil;
  CharsetInfo: rCharsetInfo;
begin
  Detector:= TnsUniversalDetector.Create;
  try
    Detector.Reset;
    Detector.HandleData(PChar(S), Length(S));
    if not Detector.Done then Detector.DataEnd;
    CharsetInfo:= Detector.GetDetectedCharsetInfo;
    case CharsetInfo.CodePage of
        866: Result:= 'CP866';
        932: Result:= 'CP932';
        950: Result:= 'CP950';
       1251: Result:= 'CP1251';
       1252: Result:= 'CP1252';
       1253: Result:= 'CP1253';
       1255: Result:= 'CP1255';
      20866: Result:= 'KOI-8';
      else
        begin
          Result:= CharsetInfo.Name;
          // When unknown encoding then use system encoding
          if SupportedEncodings.IndexOf(Result) < 0 then
          begin
            if (FallbackLang = 'be') or (FallbackLang = 'bg') or
               (FallbackLang = 'ky') or (FallbackLang = 'mk') or
               (FallbackLang = 'mn') or (FallbackLang = 'ru') or
               (FallbackLang = 'tt') then
              Result:= DetectCharsetCyrillic(S)
            else
              begin
                Result:= GetDefaultTextEncoding;
                if NormalizeEncoding(Result) = EncodingUTF8 then begin
                  // the system encoding is UTF-8, but it is not UTF-8
                  // use ISO-8859-1 instead. This encoding has a full 1:1 mapping to unicode,
                  // so no character is lost during conversions.
                  Result:= 'ISO-8859-1';
                end;
              end;
          end;
        end;
    end;
  finally
    FreeAndNil(Detector);
  end;
end;

function DetectEncoding(const S: String): String;

  function CompareI(p1, p2: PChar; Count: integer): boolean;
  var
    i: Integer;
    Chr1: Byte;
    Chr2: Byte;
  begin
    for i:=1 to Count do begin
      Chr1 := byte(p1^);
      Chr2 := byte(p2^);
      if Chr1<>Chr2 then begin
        if Chr1 in [97..122] then
          dec(Chr1,32);
        if Chr2 in [97..122] then
          dec(Chr2,32);
        if Chr1<>Chr2 then exit(false);
      end;
      inc(p1);
      inc(p2);
    end;
    Result:=true;
  end;

var
  L, P: Integer;
  EndPos: Integer;
begin
  L:= Length(S);
  if L = 0 then begin
    Result:= GetDefaultTextEncoding;
    Exit;
  end;

  // Try detect Unicode
  case DetectEncoding(S, meOEM, False) of
    meUTF8:    Exit(EncodingUTF8);
    meUTF8BOM: Exit(EncodingUTF8BOM);
    meUTF16LE: Exit(EncodingUCS2LE);
    meUTF16BE: Exit(EncodingUCS2BE);
  end;

  // Try {%encoding eee}
  if (L >= 11) and CompareI(@S[1], '{%encoding ', 11) then
  begin
    P:= 12;
    while (P <= L) and (S[P] in [' ', #9]) do Inc(P);
    EndPos:= P;
    while (EndPos <= L) and (not (S[EndPos] in ['}', ' ', #9])) do Inc(EndPos);
    Result:= NormalizeEncoding(Copy(S, P, EndPos - P));
    Exit;
  end;

  // Try to detect encoding
  Result:= MyDetectCodePageType(S);
end;

function SingleByteEncoding(TextEncoding: String): Boolean;
begin
  TextEncoding := NormalizeEncoding(TextEncoding);
  Result := (TextEncoding <> EncodingUTF8) and (TextEncoding <> EncodingUTF8BOM) and
            (TextEncoding <> EncodingUCS2LE) and (TextEncoding <> EncodingUCS2BE);
end;

function DetectEncoding(const S: String; ADefault: TMacroEncoding;
  AStrict: Boolean): TMacroEncoding;
var
  L, P, I: Integer;
begin
  L:= Length(S);
  if L = 0 then Exit(ADefault);

  // Try UTF-8 BOM (Byte Order Mark)
  if (L >= 3) and (S[1] = #$EF) and (S[2] = #$BB ) and (S[3] = #$BF) then
  begin
    Result:= meUTF8BOM;
    Exit;
  end;

  // Try ucs-2le BOM FF FE
  if (L >= 2) and (S[1] = #$FF) and (S[2] = #$FE) then
  begin
    Result:= meUTF16LE;
    Exit;
  end;

  // Try ucs-2be BOM FE FF
  if (L >= 2) and (S[1] = #$FE) and (S[2] = #$FF) then
  begin
    Result:= meUTF16BE;
    Exit;
  end;

  // Try UTF-8 (this includes ASCII)
  P:= 1;
  I:= Ord(not AStrict);
  while (P <= L) do
  begin
    if Ord(S[P]) < 128 then
    begin
      // ASCII
      Inc(P);
    end
    else begin
      I:= UTF8CharacterStrictLength(@S[P]);
      if I = 0 then Exit(ADefault);
      Inc(P, I);
    end;
  end;
  if I <> 0 then
    Result:= meUTF8
  else begin
    Result:= ADefault;
  end;
end;

function TextIsASCII(const S: String): Boolean; inline;
var
  I: Integer;
begin
  for I:= 1 to Length(S) do
  begin
    if Ord(S[I]) > 127 then
      Exit(False);
  end;
  Result:= True;
end;

initialization
  InitStatistic;
  GetLanguageIDs(Lang, FallbackLang);
  SupportedEncodings:= TStringList.Create;
  GetSupportedEncodings(SupportedEncodings);

finalization
  FreeAndNil(SupportedEncodings);

end.