doublecmd/components/doublecmd/dcconvertencoding.pas
2024-07-06 13:20:43 +03:00

651 lines
17 KiB
ObjectPascal

unit DCConvertEncoding;
{$mode objfpc}{$H+}
{$IF DEFINED(DARWIN)}
{$modeswitch objectivec1}
{$ENDIF}
interface
uses
Classes, SysUtils;
{$IF NOT DECLARED(RawByteString)}
type
RawByteString = AnsiString;
{$IFEND}
var
{en
Convert from OEM to System encoding, if needed
}
CeOemToSys: function (const Source: String): RawByteString;
CeSysToOem: function (const Source: String): RawByteString;
{en
Convert from OEM to UTF-8 encoding, if needed
}
CeOemToUtf8: function (const Source: String): RawByteString;
CeUtf8ToOem: function (const Source: String): RawByteString;
{en
Convert from Ansi to System encoding, if needed
}
CeAnsiToSys: function (const Source: String): RawByteString;
CeSysToAnsi: function (const Source: String): RawByteString;
{en
Convert from ANSI to UTF-8 encoding, if needed
}
CeAnsiToUtf8: function (const Source: String): RawByteString;
CeUtf8ToAnsi: function (const Source: String): RawByteString;
{en
Convert from Utf8 to System encoding, if needed
}
CeUtf8ToSys: function (const Source: String): RawByteString;
CeSysToUtf8: function (const Source: String): RawByteString;
function CeRawToUtf8(const Source: String): RawByteString;
function CeUtf8ToUtf16(const Source: String): UnicodeString;
function CeUtf16ToUtf8(const Source: UnicodeString): RawByteString;
{$IF DEFINED(MSWINDOWS)}
function CeTryEncode(const aValue: UnicodeString; aCodePage: Cardinal;
aAllowBestFit: Boolean; out aResult: AnsiString): Boolean;
function CeTryDecode(const aValue: AnsiString; aCodePage: Cardinal;
out aResult: UnicodeString): Boolean;
{$ELSEIF DEFINED(UNIX)}
var
SystemEncodingUtf8: Boolean = False;
SystemEncoding, SystemLocale: String;
{$ENDIF}
var
SystemLanguage: String;
implementation
uses
{$IF DEFINED(UNIX)}
LazUTF8
{$IF DEFINED(DARWIN)}
, dc_iconvenc_dyn, MacOSAll, CocoaAll, StrUtils
{$ELSE}
, iconvenc_dyn, UnixCP
{$ENDIF}
{$ELSEIF DEFINED(MSWINDOWS)}
Windows
{$ENDIF}
;
{$IF DEFINED(FPC_HAS_CPSTRING)}
var
FileSystemCodePage: TSystemCodePage;
{$ENDIF}
function UTF8CharacterStrictLength(P: PAnsiChar): integer;
begin
if p=nil then exit(0);
if ord(p^)<%10000000 then begin
// regular single byte character
exit(1);
end
else if ord(p^)<%11000000 then begin
// invalid single byte character
exit(0);
end
else if ((ord(p^) and %11100000) = %11000000) then begin
// should be 2 byte character
if (ord(p[1]) and %11000000) = %10000000 then
exit(2)
else
exit(0);
end
else if ((ord(p^) and %11110000) = %11100000) then begin
// should be 3 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000) then
exit(3)
else
exit(0);
end
else if ((ord(p^) and %11111000) = %11110000) then begin
// should be 4 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000)
and ((ord(p[3]) and %11000000) = %10000000) then
exit(4)
else
exit(0);
end else
exit(0);
end;
function CeRawToUtf8(const Source: String): RawByteString;
var
P: PAnsiChar;
I, L: LongInt;
begin
L:= Length(Source);
// Try UTF-8 (this includes ASCII)
P:= PAnsiChar(Source);
repeat
if Ord(P^) < 128 then begin
// ASCII
if (P^ = #0) and (P - PAnsiChar(Source) >= L) then begin
Result:= Source;
Exit;
end;
Inc(P);
end else begin
I:= UTF8CharacterStrictLength(P);
if I = 0 then Break;
Inc(P, I);
end;
until False;
Result:= CeSysToUtf8(Source);
end;
function CeUtf8ToUtf16(const Source: String): UnicodeString;
{$IF DEFINED(MSWINDOWS)}
var
L: SizeUInt;
begin
L:= Length(Source);
if L = 0 then Exit('');
SetLength(Result, L + 1);
// wide chars of UTF-16 <= bytes of UTF-8 string
SetLength(Result, MultiByteToWideChar(CP_UTF8, 0, PAnsiChar(Source), L, PWideChar(Result), L + 1));
end;
{$ELSE}
var
L: SizeUInt;
begin
L:= Length(Source);
if L = 0 then Exit('');
SetLength(Result, L + 1);
if (ConvertUTF8ToUTF16(PUnicodeChar(Result), L + 1, PAnsiChar(Source), L,
[toInvalidCharToSymbol], L) = trNoError) then
begin
SetLength(Result, L - 1);
end
else begin
SetLength(Result, 0);
end;
end;
{$ENDIF}
function CeUtf16ToUtf8(const Source: UnicodeString): RawByteString;
{$IF DEFINED(MSWINDOWS)}
var
L: SizeUInt;
begin
L:= Length(Source);
if (L = 0) then Exit('');
SetLength(Result, L * 3);
// bytes of UTF-8 <= 3 * wide chars of UTF-16 string
// e.g. %11100000 10100000 10000000 (UTF-8) is $0800 (UTF-16)
SetLength(Result, WideCharToMultiByte(CP_UTF8, 0,
PWideChar(Source), L, PAnsiChar(Result), Length(Result), nil, nil));
end;
{$ELSE}
var
L: SizeUInt;
begin
L:= Length(Source);
if (L = 0) then Exit('');
SetLength(Result, L * 3);
if (ConvertUTF16ToUTF8(PAnsiChar(Result), Length(Result) + 1, PUnicodeChar(Source), L,
[toInvalidCharToSymbol], L) = trNoError) then
begin
SetLength(Result, L - 1);
end
else begin
SetLength(Result, 0);
end;
end;
{$ENDIF}
function Dummy(const Source: String): RawByteString;
begin
Result:= Source;
end;
{$IF DEFINED(FPC_HAS_CPSTRING)}
function Sys2UTF8(const Source: String): RawByteString;
begin
Result:= Source;
SetCodePage(Result, FileSystemCodePage, False);
SetCodePage(Result, CP_UTF8, True);
// Prevent another codepage appear in the strings
// we don't need codepage conversion magic in our code
SetCodePage(Result, DefaultSystemCodePage, False);
end;
function UTF82Sys(const Source: String): RawByteString;
begin
Result:= Source;
SetCodePage(Result, CP_UTF8, False);
SetCodePage(Result, FileSystemCodePage, True);
// Prevent another codepage appear in the strings
// we don't need codepage conversion magic in our code
SetCodePage(Result, DefaultSystemCodePage, False);
end;
{$ELSE}
function Sys2UTF8(const Source: String): RawByteString;
begin
Result:= UTF8Encode(Source);
end;
function UTF82Sys(const Source: String): RawByteString;
begin
Result:= UTF8Decode(Source);
end;
{$ENDIF}
{$IF DEFINED(MSWINDOWS)}
function CeTryEncode(const aValue: UnicodeString; aCodePage: Cardinal;
aAllowBestFit: Boolean; out aResult: AnsiString): Boolean;
// Try to encode the given Unicode string as the requested codepage
const
WC_NO_BEST_FIT_CHARS = $00000400;
Flags: array[Boolean] of DWORD = (WC_NO_BEST_FIT_CHARS, 0);
var
UsedDefault: BOOL;
begin
if not aAllowBestFit and not CheckWin32Version(4, 1) then
Result := False
else begin
SetLength(aResult, WideCharToMultiByte(aCodePage, Flags[aAllowBestFit],
PWideChar(aValue), Length(aValue), nil, 0, nil, @UsedDefault));
SetLength(aResult, WideCharToMultiByte(aCodePage, Flags[aAllowBestFit],
PWideChar(aValue), Length(aValue), PAnsiChar(aResult),
Length(aResult), nil, @UsedDefault));
Result := not UsedDefault;
end;
end;
function CeTryDecode(const aValue: AnsiString; aCodePage: Cardinal;
out aResult: UnicodeString): Boolean;
begin
SetLength(aResult, MultiByteToWideChar(aCodePage, MB_ERR_INVALID_CHARS,
LPCSTR(aValue), Length(aValue), nil, 0) * SizeOf(UnicodeChar));
SetLength(aResult, MultiByteToWideChar(aCodePage, MB_ERR_INVALID_CHARS,
LPCSTR(aValue), Length(aValue), PWideChar(aResult), Length(aResult)));
Result := Length(aResult) > 0;
end;
function Oem2Utf8(const Source: String): RawByteString;
var
UnicodeResult: UnicodeString;
begin
if CeTryDecode(Source, CP_OEMCP, UnicodeResult) then
Result:= CeUtf16ToUtf8(UnicodeResult)
else
Result:= Source;
end;
function Utf82Oem(const Source: String): RawByteString;
var
AnsiResult: AnsiString;
begin
if CeTryEncode(CeUtf8ToUtf16(Source), CP_OEMCP, False, AnsiResult) then
Result:= AnsiResult
else
Result:= Source;
end;
function OEM2Ansi(const Source: String): RawByteString;
var
Dst: PAnsiChar;
begin
Result:= Source;
Dst:= AllocMem((Length(Result) + 1) * SizeOf(AnsiChar));
if OEMToChar(PAnsiChar(Result), Dst) then
Result:= StrPas(Dst);
FreeMem(Dst);
end;
function Ansi2OEM(const Source: String): RawByteString;
var
Dst: PAnsiChar;
begin
Result := Source;
Dst := AllocMem((Length(Result) + 1) * SizeOf(AnsiChar));
if CharToOEM(PAnsiChar(Result), Dst) then
Result := StrPas(Dst);
FreeMem(Dst);
end;
procedure Initialize;
var
Buffer: array[1..4] of AnsiChar;
begin
CeOemToSys:= @OEM2Ansi;
CeSysToOem:= @Ansi2OEM;
CeOemToUtf8:= @Oem2Utf8;
CeUtf8ToOem:= @Utf82Oem;
CeAnsiToSys:= @Dummy;
CeSysToAnsi:= @Dummy;
CeAnsiToUtf8:= @Sys2UTF8;
CeUtf8ToAnsi:= @UTF82Sys;
CeSysToUtf8:= @Sys2UTF8;
CeUtf8ToSys:= @UTF82Sys;
if GetLocaleInfo(GetUserDefaultLCID, LOCALE_SABBREVLANGNAME, @Buffer[1], 4) > 0 then
SystemLanguage := LowerCase(Copy(Buffer, 1, 2));
end;
{$ELSEIF DEFINED(UNIX)}
{$I dcconvertencoding.inc}
const
EncodingUTF8 = 'UTF-8'; // UTF-8 Encoding
var
EncodingOEM, // OEM Encoding
EncodingANSI: String; // ANSI Encoding
function GetSystemEncoding: Boolean;
{$IF DEFINED(DARWIN)}
var
Country: String;
CurrentLocale: NSLocale;
LanguageCFRef: CFStringRef = nil;
LanguageCFArray: CFArrayRef = nil;
begin
// System encoding
SystemEncoding:= EncodingUTF8;
// Get system language
LanguageCFArray:= CFLocaleCopyPreferredLanguages;
try
Result:= CFArrayGetCount(LanguageCFArray) > 0;
if Result then
begin
LanguageCFRef:= CFArrayGetValueAtIndex(LanguageCFArray, 0);
SetLength(SystemLanguage, MAX_PATH);
Result:= CFStringGetCString(LanguageCFRef,
PAnsiChar(SystemLanguage),
MAX_PATH,
kCFStringEncodingUTF8
);
end;
finally
CFRelease(LanguageCFArray);
end;
if Result then
begin
// Crop to terminating zero
SystemLanguage:= PAnsiChar(SystemLanguage);
SystemLanguage:= Copy2Symb(SystemLanguage, '-');
// Get system country
CurrentLocale:= NSLocale.currentLocale();
Country:= NSString(CurrentLocale.objectForKey(NSLocaleCountryCode)).UTF8String;
// Combine system locale
if (Length(SystemLanguage) > 0) and (Length(Country) > 0) then
begin
SystemLocale:= SystemLanguage + '_' + Country;
end;
end;
end;
{$ELSE}
var
I: Integer;
Lang: String;
begin
Result:= True;
Lang:= SysUtils.GetEnvironmentVariable('LC_ALL');
if Length(Lang) = 0 then
begin
Lang:= SysUtils.GetEnvironmentVariable('LC_CTYPE');
if Length(Lang) = 0 then
begin
Lang:= SysUtils.GetEnvironmentVariable('LANG');
if Length(Lang) = 0 then
Exit(False);
end;
end;
I:= Pos('_', Lang);
if (I = 0) then
SystemLanguage:= Lang
else begin
SystemLanguage:= Copy(Lang, 1, I - 1);
end;
I:= System.Pos('.', Lang);
if (I > 0) then
begin
SystemLocale:= Copy(Lang, 1, I - 1);
SystemEncoding:= Copy(Lang, I + 1, Length(Lang) - I);
end
else begin
SystemLocale:= Lang;
SystemEncoding:= EncodingUTF8;
end;
end;
{$ENDIF}
{$IF DEFINED(DARWIN)}
function InitIconv(var Error: String): Boolean;
begin
Error:= EmptyStr;
Result:= TryLoadLib('libiconv.dylib', Error);
IconvLibFound:= IconvLibFound or Result;
end;
{$ELSEIF DEFINED(FPC_HAS_CPSTRING)}
var
AManager : TUnicodeStringManager;
function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
begin
Result:= UnixCP.GetSystemCodepage;
end;
procedure SetStdIOCodePage(var T: Text); inline;
begin
case TextRec(T).Mode of
fmInput: TextRec(T).CodePage:= GetStandardCodePage(scpConsoleInput);
fmOutput: TextRec(T).CodePage:= GetStandardCodePage(scpConsoleOutput);
end;
end;
procedure SetStdIOCodePages; inline;
begin
SetStdIOCodePage(Input);
SetStdIOCodePage(Output);
SetStdIOCodePage(ErrOutput);
SetStdIOCodePage(StdOut);
SetStdIOCodePage(StdErr);
end;
{$ENDIF}
function FindEncoding: Boolean;
var
Index: Integer;
begin
// Try to find by language and country
for Index:= Low(charset_relation) to High(charset_relation) do
begin
if CompareStr(charset_relation[Index, 1], SystemLocale) = 0 then
begin
EncodingANSI:= charset_relation[Index, 2];
EncodingOEM:= charset_relation[Index, 3];
Exit(True);
end;
end;
// Try to find by language only
for Index:= Low(charset_relation) to High(charset_relation) do
begin
if CompareStr(charset_relation[Index, 0], SystemLanguage) = 0 then
begin
EncodingANSI:= charset_relation[Index, 2];
EncodingOEM:= charset_relation[Index, 3];
Exit(True);
end;
end;
Result:= False;
end;
function Oem2Utf8(const Source: String): RawByteString;
begin
Result:= Source;
Iconvert(Source, String(Result), EncodingOEM, EncodingUTF8);
end;
function Utf82Oem(const Source: String): RawByteString;
begin
Result:= Source;
Iconvert(Source, String(Result), EncodingUTF8, EncodingOEM);
end;
function OEM2Sys(const Source: String): RawByteString;
begin
Result:= Source;
Iconvert(Source, String(Result), EncodingOEM, SystemEncoding);
end;
function Sys2OEM(const Source: String): RawByteString;
begin
Result:= Source;
Iconvert(Source, String(Result), SystemEncoding, EncodingOEM);
end;
function Ansi2Sys(const Source: String): RawByteString;
begin
Result:= Source;
Iconvert(Source, String(Result), EncodingANSI, SystemEncoding);
end;
function Sys2Ansi(const Source: String): RawByteString;
begin
Result:= Source;
Iconvert(Source, String(Result), SystemEncoding, EncodingANSI);
end;
function Ansi2Utf8(const Source: String): RawByteString;
begin
Result:= Source;
Iconvert(Source, String(Result), EncodingANSI, EncodingUTF8);
end;
function Utf82Ansi(const Source: String): RawByteString;
begin
Result:= Source;
Iconvert(Source, String(Result), EncodingUTF8, EncodingANSI);
end;
procedure Initialize;
var
Error: String = '';
begin
CeOemToSys:= @Dummy;
CeSysToOem:= @Dummy;
CeOemToUtf8:= @Dummy;
CeUtf8ToOem:= @Dummy;
CeAnsiToSys:= @Dummy;
CeSysToAnsi:= @Dummy;
CeUtf8ToSys:= @Dummy;
CeSysToUtf8:= @Dummy;
CeAnsiToUtf8:= @Dummy;
CeUtf8ToAnsi:= @Dummy;
{$IF DEFINED(FPC_HAS_CPSTRING) and NOT DEFINED(DARWIN)}
{
If locale does not exists then nl_langinfo (called by cwstring unit)
returns ANSI_X3.4-1968 (CP_ASCII) as system encoding. Try to find correct
encoding by using environment variables LC_ALL, LC_CTYPE, LANG in this case.
}
if DefaultFileSystemCodePage = CP_ASCII then
begin
DefaultFileSystemCodePage:= UnixCP.GetSystemCodepage;
// Use CP_UTF8 if cannot determine system encoding
if DefaultFileSystemCodePage = CP_ASCII then
DefaultFileSystemCodePage:= CP_UTF8
else begin
GetWideStringManager(AManager);
AManager.GetStandardCodePageProc:= @GetStandardCodePage;
SetWideStringManager(AManager);
end;
SetStdIOCodePages;
FileSystemCodePage:= DefaultFileSystemCodePage;
DefaultSystemCodePage:= DefaultFileSystemCodePage;
DefaultRTLFileSystemCodePage:= DefaultFileSystemCodePage;
end;
{$ENDIF}
// Try to get system encoding and initialize Iconv library
if not (GetSystemEncoding and InitIconv(Error)) then
WriteLn(Error)
else
begin
SystemEncodingUtf8:= (SysUtils.CompareText(SystemEncoding, 'UTF-8') = 0) or
(SysUtils.CompareText(SystemEncoding, 'UTF8') = 0);
if FindEncoding then
begin
if (Length(EncodingOEM) > 0) then
begin
CeOemToSys:= @OEM2Sys;
CeSysToOem:= @Sys2OEM;
CeOemToUtf8:= @Oem2Utf8;
CeUtf8ToOem:= @Utf82Oem;
end;
if (Length(EncodingANSI) > 0) then
begin
CeAnsiToSys:= @Ansi2Sys;
CeSysToAnsi:= @Sys2Ansi;
CeAnsiToUtf8:= @Ansi2Utf8;
CeUtf8ToAnsi:= @Utf82Ansi;
end;
end;
if not SystemEncodingUtf8 then
begin
CeUtf8ToSys:= @UTF82Sys;
CeSysToUtf8:= @Sys2UTF8;
end;
end;
WriteLn('SystemLocale ', SystemLocale);
WriteLn('SystemLanguage ', SystemLanguage);
WriteLn('SystemEncoding ', SystemEncoding);
WriteLn('DefaultSystemCodePage ', DefaultSystemCodePage);
WriteLn('DefaultFileSystemCodePage ', DefaultFileSystemCodePage);
WriteLn('DefaultRTLFileSystemCodePage ', DefaultRTLFileSystemCodePage);
end;
{$ELSE}
procedure Initialize;
begin
CeOemToSys:= @Dummy;
CeSysToOem:= @Dummy;
CeOemToUtf8:= @Dummy;
CeUtf8ToOem:= @Dummy;
CeAnsiToSys:= @Dummy;
CeSysToAnsi:= @Dummy;
CeUtf8ToSys:= @Dummy;
CeSysToUtf8:= @Dummy;
CeAnsiToUtf8:= @Dummy;
CeUtf8ToAnsi:= @Dummy;
end;
{$ENDIF}
initialization
{$IF DEFINED(FPC_HAS_CPSTRING)}
{$IF DEFINED(MSWINDOWS)}
FileSystemCodePage:= Windows.GetACP;
{$ELSE}
FileSystemCodePage:= WideStringManager.GetStandardCodePageProc(scpFileSystemSingleByte);
{$ENDIF}
{$ENDIF}
Initialize;
end.