doublecmd/src/uaccentsutils.pas
2020-02-24 14:34:28 +00:00

172 lines
7.3 KiB
ObjectPascal

{
Double Commander
-------------------------------------------------------------------------
Routine related with characters with accents/ligatures and their equivalents without
Copyright (C) 2016 Alexander Koblov (alexx2000@mail.ru)
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
}
unit uAccentsUtils;
{$mode objfpc}{$H+}
interface
uses
Classes, SysUtils;
procedure LoadInMemoryOurAccentLookupTableList;
procedure FreeMemoryFromOurAccentLookupTableList;
function NormalizeAccentedChar(sInput: string): string;
function PosOfSubstrWithVersatileOptions(sSubString, sWholeString: string; bCaseSensitive, bIgnoreAccent: boolean; var ActualCharFittedInInput: integer): integer;
var
gslAccents, gslAccentsStripped: TStringList;
resourcestring
rsStrAccents = 'á;â;à;å;ã;ä;ç;é;ê;è;ë;í;î;ì;ï;ñ;ó;ô;ò;ø;õ;ö;ú;û;ù;ü;ÿ;Á;Â;À;Å;Ã;Ä;Ç;É;Ê;È;Ë;Í;Í;Ì;Ï;Ñ;Ó;Ô;Ø;Õ;Ö;ß;Ú;Û;Ù;Ü;Ÿ;¿;¡;œ;æ;Æ;Œ';
rsStrAccentsStripped = 'a;a;a;a;a;a;c;e;e;e;e;i;i;i;i;n;o;o;o;o;o;o;u;u;u;u;y;A;A;A;A;A;A;C;E;E;E;E;I;I;I;I;N;O;O;O;O;O;B;U;U;U;U;Y;?;!;oe;ae;AE;OE';
implementation
uses
//Lazarus, Free-Pascal, etc.
LazUTF8,
//DC
DCStrUtils;
{ LoadInMemoryOurAccentLookupTableList }
procedure LoadInMemoryOurAccentLookupTableList;
var
slTempoAccents, slTempoAccentsStripped: TStringList;
iChar, iPos: integer;
begin
slTempoAccents := TStringList.Create;
slTempoAccentsStripped := TStringList.Create;
try
ParseLineToList(rsStrAccents, slTempoAccents);
ParseLineToList(rsStrAccentsStripped, slTempoAccentsStripped);
if slTempoAccents.Count <> slTempoAccentsStripped.Count then
raise Exception.Create('Unexpected situation in LoadInMemoryOurAccentLookupTableList!' + #$0A + 'Most probably problem in language file regarding conversion string with accents...');
gslAccents := TStringList.Create;
gslAccents.Assign(slTempoAccents);
gslAccents.Sort;
gslAccentsStripped := TStringList.Create;
for iChar := 0 to pred(gslAccents.Count) do
begin
iPos := slTempoAccents.IndexOf(gslAccents.Strings[iChar]);
if iPos <> -1 then
gslAccentsStripped.add(slTempoAccentsStripped.Strings[iPos])
else
raise Exception.Create('Unexpected situation in LoadInMemoryOurAccentLookupTableList! (Error in Rejavik)');
end;
if gslAccents.Count <> gslAccentsStripped.Count then
raise Exception.Create('Unexpected situation in LoadInMemoryOurAccentLookupTableList! (Error in Mexico)');
finally
FreeAndNil(slTempoAccents);
FreeAndNil(slTempoAccentsStripped);
end;
end;
{ FreeMemoryFromOurAccentLookupTableList }
procedure FreeMemoryFromOurAccentLookupTableList;
begin
if gslAccents <> nil then FreeAndNil(gslAccents);
if gslAccentsStripped <> nil then FreeAndNil(gslAccentsStripped);
end;
{ NormalizeAccentedChar }
function NormalizeAccentedChar(sInput: string): string;
var
iIndexChar, iPosChar: integer;
cWorkingChar: string;
begin
Result := '';
for iIndexChar := 1 to UTF8length(sInput) do
begin
cWorkingChar := UTF8Copy(sInput, iIndexChar, 1);
iPosChar := gslAccents.IndexOf(cWorkingChar);
if iPosChar = -1 then
Result := Result + cWorkingChar
else
Result := Result + gslAccentsStripped.Strings[iPosChar];
end;
end;
{ PosOfSubstrWithVersatileOptions }
// NOTE: Function will search "sSubString" inside the "sWholeString" and return the position where it found it.
// WARNING! Function is assuming the "sSubString" is already preformated adequately and won't do it.
// For example, if we do a search case insensitive, it is assumed you arrived here with "sSubString" already in UTF8UpperCase. So with "ABC" and not "abc".
// For example, if you search ignoring accent, it is assumed you arrived here with "sSubString" already without accents. So with "aei" and not "àéî".
// For example, if you search ignoring ligature, it is assumed you arrived here with "sSubString" already without ligature. So with "oe" and not with "œ".
// No need to preformat, obviously, the "sWholeString".
// ALL this is to speed up a little things since often we'll search the SAME string over and over in a whole string.
// We'll gain something preparing our "sWholeString" once for all AND THEN keep re-searhcing in many strings using the following routine.
// ALSO, because of the ligature possibility, the parameter "ActualCharFittedInInput" will be set according to the number of chars from the "sWholeString" that was used for finding the "sSubString".
// For example, if we search "oeu" inside "soeur", the return value will be 2 and the "ActualCharFittedInInput" will be set to 3...
// But if we search "oeu" inside "sœur", the return value will still be 2 BUT the "ActualCharFittedInInput" will be set to 2 since only two chars were required!
// The author of this doesn't know for other language, but for French, this is a nice routine! :-)
function PosOfSubstrWithVersatileOptions(sSubString, sWholeString: string; bCaseSensitive, bIgnoreAccent: boolean; var ActualCharFittedInInput: integer): integer;
var
sLocal: string;
iActualPos: integer;
iInnerResult: integer = 0;
begin
ActualCharFittedInInput := 0;
if bIgnoreAccent then sLocal := NormalizeAccentedChar(sWholeString) else sLocal := sWholeString;
if not bCaseSensitive then sLocal := UTF8UpperCase(sLocal);
iInnerResult := UTF8Pos(sSubString, sLocal);
if iInnerResult > 0 then
begin
iActualPos := 0;
sLocal := '';
while (UTF8Length(sLocal) < iInnerResult) and (iActualPos < length(sWholeString)) do
begin
Inc(iActualPos);
if bIgnoreAccent then
sLocal := sLocal + NormalizeAccentedChar(UTF8copy(sWholeString, iActualPos, 1))
else
sLocal := sLocal + UTF8copy(sWholeString, iActualPos, 1);
end;
Result := iActualPos;
//Once here, "iActualPos" holds the actual position of our substring in the string.
//Then, we now add the char one by one until it match what were where searching AND IT SHOULD MATCH because "iInnerResult" has a value.
sLocal := '';
while (UTF8Pos(sSubString, sLocal) = 0) and ((Result + ActualCharFittedInInput) <= UTF8Length(sWholeString)) do
begin
Inc(ActualCharFittedInInput);
sLocal := UTF8copy(sWholeString, Result, ActualCharFittedInInput);
if bIgnoreAccent then sLocal := NormalizeAccentedChar(sLocal) else sLocal := sLocal;
if not bCaseSensitive then sLocal := UTF8UpperCase(sLocal);
end;
end
else
begin
Result := 0;
end;
end;
end.