ADD: Case insensitive text search in UTF-16 files

This commit is contained in:
Alexander Koblov 2016-08-02 09:31:37 +00:00
commit 8a95b25684
5 changed files with 154 additions and 25 deletions

View file

@ -589,20 +589,6 @@ var
AEncoding: String;
begin
AEncoding:= NormalizeEncoding(cmbEncoding.Text);
if (AEncoding = EncodingUCS2LE) or (AEncoding = EncodingUCS2BE) then
begin
if cbCaseSens.Enabled then
begin
cbCaseSens.Tag:= Integer(cbCaseSens.Checked);
cbCaseSens.Checked:= True;
cbCaseSens.Enabled:= False;
end;
end
else if not cbCaseSens.Enabled then
begin
cbCaseSens.Checked:= Boolean(cbCaseSens.Tag);
cbCaseSens.Enabled:= True;
end;
cbTextRegExp.Enabled := (AEncoding = EncodingAnsi);
if not cbTextRegExp.Enabled then cbTextRegExp.Checked:= False;
end;
@ -829,7 +815,20 @@ end;
// OnChange handler of cbTextRegExp.
// While cbTextRegExp is checked, cbCaseSens is unchecked and disabled and its
// previous Checked state is kept in cbCaseSens.Tag; when cbTextRegExp is
// unchecked again, that saved state is restored and cbCaseSens is re-enabled.
procedure TfrmFindDlg.cbTextRegExpChange(Sender: TObject);
begin
// NOTE(review): the next line looks like the pre-change statement shown by the
// diff view. Confirm it is not present in the final source: if it runs first,
// cbCaseSens.Checked is already False when it is saved into Tag below.
if cbTextRegExp.Checked then cbCaseSens.Checked:= False;
if cbTextRegExp.Checked then
begin
// Enabled doubles as the "state already saved" flag, so Tag is written only once
if cbCaseSens.Enabled then
begin
// Save the current choice, then force the check box off and lock it
cbCaseSens.Tag:= Integer(cbCaseSens.Checked);
cbCaseSens.Checked:= False;
cbCaseSens.Enabled:= False;
end;
end
// cbTextRegExp is unchecked and cbCaseSens is still locked: undo the lock
else if not cbCaseSens.Enabled then
begin
// Restore the choice saved in Tag
cbCaseSens.Checked:= Boolean(cbCaseSens.Tag);
cbCaseSens.Enabled:= True;
end;
end;
procedure TfrmFindDlg.cbSelectedFilesChange(Sender: TObject);

View file

@ -2025,6 +2025,15 @@ begin
bTextFound := (PAnsiAddr <> Pointer(-1));
if bTextFound then FLastSearchPos := PAnsiAddr - ViewerControl.GetDataAdr;
end
// Using special case insensitive UTF-16 search algorithm
else if (ViewerControl.Encoding in [veUtf16le, veUtf16be, veUcs2le, veUcs2be]) then
begin
PAnsiAddr := PosMemW(ViewerControl.GetDataAdr, ViewerControl.FileSize,
FLastSearchPos, sSearchTextA, bSearchBackwards,
ViewerControl.Encoding in [veUtf16le, veUcs2le]);
bTextFound := (PAnsiAddr <> Pointer(-1));
if bTextFound then FLastSearchPos := PAnsiAddr - ViewerControl.GetDataAdr;
end
// Using very slow search algorithm
else if (ViewerControl.Encoding in ViewerEncodingMultiByte) or bSearchBackwards then
begin

View file

@ -32,7 +32,7 @@ uses
Classes, SysUtils, DCBasicTypes, uFile;
type
TTextSearch = (tsAnsi, tsUtf8, tsOther);
TTextSearch = (tsAnsi, tsUtf8, tsUtf16le, tsUtf16be, tsOther);
TTimeUnit = (tuSecond, tuMinute, tuHour, tuDay, tuWeek, tuMonth, tuYear);
TFileSizeUnit = (suBytes, suKilo, suMega, suGiga, suTera);
TPluginOperator = (poEqual, poNotEqual, poMore, poLess, poMoreEqual, poLessEqual,

View file

@ -42,6 +42,9 @@ function PosMem(pDataAddr: PChar; iDataLength, iStartPos: PtrInt; const sSearchT
function PosMemU(pDataAddr: PChar; iDataLength, iStartPos: PtrInt;
const sSearchText: String; bSearchBackwards: Boolean): Pointer;
function PosMemW(pDataAddr: PChar; iDataLength, iStartPos: PtrInt;
const sSearchText: String; bSearchBackwards, bLittleEndian: Boolean): Pointer;
{en
Searches a file for a string using memory mapping.
@ -60,10 +63,12 @@ function FindMmap(const sFileName:String; const sFindData:String; bCase:Boolean;
function FindMmapU(const sFileName: String; const sFindData: String): Integer;
function FindMmapW(const sFileName: String; const sFindData: String; bLittleEndian: Boolean): Integer;
implementation
uses
DCOSUtils, UnicodeUtils, LazUTF8, StrUtils;
SysUtils, DCOSUtils, UnicodeUtils, LazUTF8, StrUtils, DCStrUtils;
function PosMem(pDataAddr: PChar; iDataLength, iStartPos: PtrInt; const sSearchText: String;
bCaseSensitive: Boolean; bSearchBackwards: Boolean): Pointer;
@ -195,8 +200,86 @@ begin
end;
end;
function FindMmap(const sFileName, sFindData:String; bCase:Boolean;
Abort: TAbortFunction):Integer;
{
  Case insensitive search for UTF-16 text inside a memory block.

  pDataAddr        - start of the data
  iDataLength      - size of the data in bytes
  iStartPos        - byte offset the search starts from: a forward search
                     scans the bytes from iStartPos up to iDataLength, a
                     backward search scans the bytes from 0 up to iStartPos
  sSearchText      - search text; its bytes are reinterpreted as UTF-16 code
                     units and byte-swapped together with the data
                     (presumably the caller passes it in the byte order of
                     the data - TODO confirm against callers)
  bSearchBackwards - scan from iStartPos towards the beginning of the data
  bLittleEndian    - True if the data is little endian, False if big endian

  Returns a pointer to the match inside the data, or Pointer(-1) if the text
  was not found.

  The data is processed in windows that are copied, converted to native byte
  order and lower cased. Consecutive windows overlap by iLength bytes so that
  a match crossing a window border is not lost.
}
function PosMemW(pDataAddr: PChar; iDataLength, iStartPos: PtrInt;
                 const sSearchText: String; bSearchBackwards, bLittleEndian: Boolean): Pointer;
const
  BUFFER_SIZE = 4096;
var
  iSize: PtrInt;
  iWindow: PtrInt;
  iLength: Integer;
  iTextPos: Integer;
  bSwapEndian: Boolean;
  sTextBuffer: UnicodeString;
  sLowerCase: UnicodeString;

  // Copies iBytes bytes starting at byte offset iOffset into sTextBuffer,
  // converts them to native byte order and lower cases them
  procedure LoadWindow(iOffset, iBytes: PtrInt);
  begin
    SetString(sTextBuffer, PUnicodeChar(pDataAddr + iOffset), iBytes div 2);
    if bSwapEndian then Utf16SwapEndian(sTextBuffer);
    sTextBuffer:= UnicodeLowerCase(sTextBuffer);
  end;

begin
  Result := Pointer(-1);
  iLength:= Length(sSearchText);
  // Nothing to search for (Pos/RPos never match an empty text anyway)
  if iLength = 0 then Exit;
  // Swap bytes when the data byte order differs from the native one
  bSwapEndian:= {$IFDEF ENDIAN_BIG}bLittleEndian{$ELSE}not bLittleEndian{$ENDIF};

  // Number of bytes that can be searched in the requested direction
  if bSearchBackwards then
    iSize:= iStartPos
  else
    iSize:= iDataLength - iStartPos;
  if iLength > iSize then Exit;

  // Every step moves the window by (iWindow - iLength) bytes, so the window
  // must be larger than the search text. With a fixed BUFFER_SIZE window a
  // search text of BUFFER_SIZE bytes or more would never advance (endless
  // loop) or would move the offset in the wrong direction.
  iWindow:= BUFFER_SIZE;
  if iLength > (BUFFER_SIZE div 2) then iWindow:= PtrInt(iLength) * 2;

  // Search text as lower cased UTF-16 in native byte order. The appended #0
  // together with the implicit string terminator forms the wide terminator.
  sLowerCase:= PUnicodeChar(Pointer(sSearchText + #0));
  if bSwapEndian then Utf16SwapEndian(sLowerCase);
  sLowerCase:= UnicodeLowerCase(sLowerCase);

  if bSearchBackwards then
  begin
    // While remaining data size > window size
    while iStartPos > iWindow do
    begin
      iStartPos:= iStartPos - iWindow;
      LoadWindow(iStartPos, iWindow);
      iTextPos:= RPos(sLowerCase, sTextBuffer);
      if iTextPos > 0 then
        Exit(pDataAddr + iStartPos + iTextPos * 2 - 2);
      // Next window ends iLength bytes after the start of this one
      iStartPos:= iStartPos + iLength;
    end;
    // Process remaining data at the beginning
    if iLength > iStartPos then Exit;
    LoadWindow(0, iStartPos);
    iTextPos:= RPos(sLowerCase, sTextBuffer);
    if iTextPos > 0 then Result:= pDataAddr + iTextPos * 2 - 2;
  end
  else begin
    // While remaining data size > window size
    while iSize > iWindow do
    begin
      LoadWindow(iStartPos, iWindow);
      iTextPos:= Pos(sLowerCase, sTextBuffer);
      if iTextPos > 0 then
        Exit(pDataAddr + iStartPos + iTextPos * 2 - 2);
      // Next window starts iLength bytes before the end of this one
      iStartPos:= iStartPos + (iWindow - iLength);
      iSize:= iDataLength - iStartPos;
    end;
    // Process remaining data at the end
    if iLength > iSize then Exit;
    LoadWindow(iStartPos, iSize);
    iTextPos:= Pos(sLowerCase, sTextBuffer);
    if iTextPos > 0 then Result:= pDataAddr + iStartPos + iTextPos * 2 - 2;
  end;
end;
function FindMmap(const sFileName: String; const sFindData: String;
bCase: Boolean; Abort: TAbortFunction): Integer;
function PosMem(pAdr:PChar; iLength:Integer):Pointer;
var
@ -278,4 +361,25 @@ begin
end;
end;
{
  Maps the file sFileName and looks for sFindData in it with PosMemW,
  searching forward from offset 0.

  Returns  1 if PosMemW found the text,
           0 if it did not,
          -1 if MapFile failed.
}
function FindMmapW(const sFileName: String; const sFindData: String; bLittleEndian: Boolean): Integer;
var
  MapRec: TFileMapRec;
  pFound: Pointer;
begin
  if not MapFile(sFileName, MapRec) then
    Exit(-1);
  try
    pFound:= PosMemW(MapRec.MappedFile, MapRec.FileSize, 0, sFindData, False, bLittleEndian);
    // Ord(False) = 0 (not found), Ord(True) = 1 (found)
    Result:= Ord(pFound <> Pointer(-1));
  finally
    // Release the mapping even if the search raised an exception
    UnMapFile(MapRec);
  end;
end;
end.

View file

@ -119,8 +119,17 @@ begin
FTextSearchType := tsAnsi;
RecodeTable := InitRecodeTable(TextEncoding, CaseSensitive);
end
else if (CaseSensitive = False) and ((TextEncoding = EncodingUTF8) or (TextEncoding = EncodingUTF8BOM)) then
FTextSearchType:= tsUtf8
else if (CaseSensitive = False) then
begin
if ((TextEncoding = EncodingUTF8) or (TextEncoding = EncodingUTF8BOM)) then
FTextSearchType:= tsUtf8
else if (TextEncoding = EncodingUCS2LE) then
FTextSearchType:= tsUtf16le
else if (TextEncoding = EncodingUCS2BE) then
FTextSearchType:= tsUtf16be
else
FTextSearchType:= tsOther;
end
else begin
FTextSearchType:= tsOther;
end;
@ -282,9 +291,11 @@ begin
begin
// Memory mapping should be slightly faster and use less memory
case FTextSearchType of
tsAnsi: lastPos:= FindMmapBM(sFileName, sData, RecodeTable, @IsAborting);
tsUtf8: lastPos:= FindMmapU(sFileName, sData)
else lastPos:= FindMmap(sFileName, sData, bCase, @IsAborting);
tsAnsi: lastPos:= FindMmapBM(sFileName, sData, RecodeTable, @IsAborting);
tsUtf8: lastPos:= FindMmapU(sFileName, sData);
tsUtf16le: lastPos:= FindMmapW(sFileName, sData, True);
tsUtf16be: lastPos:= FindMmapW(sFileName, sData, False);
else lastPos:= FindMmap(sFileName, sData, bCase, @IsAborting);
end;
case lastPos of
0 : Exit(False);
@ -331,7 +342,13 @@ begin
begin
if PosMemU(@Buffer[0], DataRead + sDataLength - 1, 0, sData, False) <> Pointer(-1) then
Exit(True);
end
end;
tsUtf16le,
tsUtf16be:
begin
if PosMemW(@Buffer[0], DataRead + sDataLength - 1, 0, sData, False, FTextSearchType = tsUtf16le) <> Pointer(-1) then
Exit(True);
end;
else
begin
if PosMem(@Buffer[0], DataRead + sDataLength - 1, 0, sData, bCase, False) <> Pointer(-1) then