ADD: Find files - search text in Office XML (*.docx)

This commit is contained in:
Alexander Koblov 2021-04-11 16:53:41 +03:00
commit a57b98ddac
7 changed files with 201 additions and 4 deletions

View file

@ -411,7 +411,7 @@ object frmFindDlg: TfrmFindDlg
ItemHeight = 15
OnSelect = cmbEncodingSelect
Style = csDropDownList
TabOrder = 6
TabOrder = 7
end
object cmbFindText: TComboBoxWithDelItems
AnchorSideLeft.Control = CheksPanel
@ -518,7 +518,23 @@ object frmFindDlg: TfrmFindDlg
Width = 88
Caption = 'Hexadeci&mal'
OnChange = chkHexChange
TabOrder = 7
TabOrder = 8
end
object cbOffceXML: TCheckBox
  AnchorSideLeft.Control = cbTextRegExp
  AnchorSideLeft.Side = asrBottom
  AnchorSideTop.Control = cbTextRegExp
  Left = 592
  Height = 24
  Hint = 'Office XML (*.docx)'
  Top = 67
  Width = 93
  BorderSpacing.Left = 15
  Caption = 'Office XML'
  OnChange = cbOffceXMLChange
  ParentShowHint = False
  ShowHint = True
  TabOrder = 6
end
end
end

View file

@ -25,6 +25,8 @@
{"hash":35720169,"name":"tfrmfinddlg.cbreplacetext.caption","sourcebytes":[82,101,38,112,108,97,99,101,32,98,121],"value":"Re&place by"},
{"hash":137727326,"name":"tfrmfinddlg.cbtextregexp.caption","sourcebytes":[82,101,103,38,117,108,97,114,32,101,120,112,114,101,115,115,105,111,110],"value":"Reg&ular expression"},
{"hash":259470556,"name":"tfrmfinddlg.chkhex.caption","sourcebytes":[72,101,120,97,100,101,99,105,38,109,97,108],"value":"Hexadeci&mal"},
{"hash":233337209,"name":"tfrmfinddlg.cboffcexml.hint","sourcebytes":[79,102,102,99,101,32,88,77,76,32,40,42,46,100,111,99,120,41],"value":"Offce XML (*.docx)"},
{"hash":211225308,"name":"tfrmfinddlg.cboffcexml.caption","sourcebytes":[79,102,102,99,101,32,88,77,76],"value":"Offce XML"},
{"hash":197676484,"name":"tfrmfinddlg.tsadvanced.caption","sourcebytes":[65,100,118,97,110,99,101,100],"value":"Advanced"},
{"hash":122109610,"name":"tfrmfinddlg.cbdatefrom.caption","sourcebytes":[38,68,97,116,101,32,102,114,111,109,58],"value":"&Date from:"},
{"hash":34324922,"name":"tfrmfinddlg.cbnotolderthan.caption","sourcebytes":[78,38,111,116,32,111,108,100,101,114,32,116,104,97,110,58],"value":"N&ot older than:"},

View file

@ -99,6 +99,7 @@ type
cbTextRegExp: TCheckBox;
cbFindInArchive: TCheckBox;
cbOpenedTabs: TCheckBox;
cbOffceXML: TCheckBox;
chkDuplicateContent: TCheckBox;
chkDuplicateSize: TCheckBox;
chkDuplicateHash: TCheckBox;
@ -211,6 +212,7 @@ type
procedure cbDateFromChange(Sender: TObject);
procedure cbDateToChange(Sender: TObject);
procedure cbFindInArchiveChange(Sender: TObject);
procedure cbOffceXMLChange(Sender: TObject);
procedure cbOpenedTabsChange(Sender: TObject);
procedure cbPartialNameSearchChange(Sender: TObject);
procedure cbRegExpChange(Sender: TObject);
@ -805,6 +807,7 @@ begin
EnableControl(cbReplaceText, cbFindText.Checked and not cbFindInArchive.Checked);
EnableControl(cbNotContainingText, cbFindText.Checked);
EnableControl(cbTextRegExp, cbFindText.Checked);
EnableControl(cbOffceXML, cbFindText.Checked);
lblEncoding.Enabled := cbFindText.Checked;
cbReplaceText.Checked := False;
cmbEncodingSelect(nil);
@ -999,6 +1002,16 @@ begin
cbReplaceTextChange(cbReplaceText);
end;
{ TfrmFindDlg.cbOffceXMLChange }
procedure TfrmFindDlg.cbOffceXMLChange(Sender: TObject);
begin
  // Turning the Office XML option on switches off the hexadecimal
  // and the replace options.
  if cbOffceXML.Checked then
  begin
    chkHex.Checked:= False;
    cbReplaceText.Checked:= False;
  end;
  // Replace stays available only while neither hexadecimal nor
  // Office XML search is selected.
  cbReplaceText.Enabled:= (not chkHex.Checked) and (not cbOffceXML.Checked);
end;
{ TfrmFindDlg.cbOpenedTabsChange }
procedure TfrmFindDlg.cbOpenedTabsChange(Sender: TObject);
begin
@ -1088,6 +1101,7 @@ begin
begin
cbCaseSens.Tag := Integer(cbCaseSens.Checked);
end;
cbOffceXML.Checked:= False;
cbReplaceText.Checked:= False;
end
else if not cbCaseSens.Enabled then
@ -1095,7 +1109,7 @@ begin
cbCaseSens.Checked := Boolean(cbCaseSens.Tag);
end;
cmbEncoding.Enabled:= not chkHex.Checked;
cbReplaceText.Enabled:= not chkHex.Checked;
cbReplaceText.Enabled:= not (chkHex.Checked or cbOffceXML.Checked);
cmbEncodingSelect(cmbEncoding);
end;
@ -1189,6 +1203,7 @@ begin
NotContainingText := cbNotContainingText.Checked;
TextRegExp := cbTextRegExp.Checked;
TextEncoding := cmbEncoding.Text;
OfficeXML := cbOffceXML.Checked;
{ Duplicates }
Duplicates:= chkDuplicates.Checked;
DuplicateName:= chkDuplicateName.Checked;
@ -2291,6 +2306,7 @@ begin
cbNotContainingText.Checked := NotContainingText;
cbTextRegExp.Checked := TextRegExp;
cmbEncoding.Text := TextEncoding;
cbOffceXML.Checked := OfficeXML;
if cbFindInArchive.Enabled then
begin

View file

@ -86,6 +86,7 @@ type
NotContainingText: Boolean;
TextRegExp: Boolean;
TextEncoding: String;
OfficeXML: Boolean;
{ Duplicates }
Duplicates: Boolean;
DuplicateName: Boolean;

View file

@ -53,6 +53,7 @@ type
FFoundFile:String;
FCurrentDepth: Integer;
FTextSearchType: TTextSearch;
FSearchText: String;
FSearchTemplate: TSearchTemplateRec;
FSelectedFiles: TStringList;
FFileChecks: TFindFileChecks;
@ -112,7 +113,7 @@ implementation
uses
LCLProc, LazUtf8, StrUtils, LConvEncoding, DCStrUtils,
uLng, DCClassesUtf8, uFindMmap, uGlobs, uShowMsg, DCOSUtils, uOSUtils, uHash,
uLog, WcxPlugin, Math, uDCUtils, uConvEncoding, DCDateTimeUtils;
uLog, WcxPlugin, Math, uDCUtils, uConvEncoding, DCDateTimeUtils, uOfficeXML;
function ProcessDataProcAG(FileName: PAnsiChar; Size: LongInt): LongInt; dcpcall;
begin
@ -157,6 +158,8 @@ begin
if IsFindText then
begin
FSearchText := FindText;
if HexValue then
begin
TextEncoding := EncodingAnsi;
@ -352,6 +355,21 @@ begin
Result := False;
if sData = '' then Exit;
if FSearchTemplate.OfficeXML and MatchesMask(sFileName, '*.docx') then
begin
if LoadFromOffice(sFileName, S) then
begin
if bRegExp then
Result:= uRegExprW.ExecRegExpr(UTF8ToUTF16(FSearchText), UTF8ToUTF16(S))
else if FSearchTemplate.CaseSensitive then
Result:= PosMem(Pointer(S), Length(S), 0, FSearchText, False, False) <> Pointer(-1)
else begin
Result:= PosMemU(Pointer(S), Length(S), 0, FSearchText, False) <> Pointer(-1);
end;
end;
Exit;
end;
// Simple regular expression search (don't work for very big files)
if bRegExp then
begin

140
src/uofficexml.pas Normal file
View file

@ -0,0 +1,140 @@
{
Double commander
-------------------------------------------------------------------------
Load text from office xml (*.docx)
Copyright (C) 2021 Alexander Koblov (alexx2000@mail.ru)
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
}
unit uOfficeXML;
{$mode objfpc}{$H+}
interface
uses
Classes, SysUtils;
function LoadFromOffice(const FileName: String; out AText: String): Boolean;
implementation
uses
Unzip, ZipUtils, Laz2_DOM, laz2_XMLRead;
{
  Walks the children of ANode depth-first and appends their text to S.

  'w:t'            - appends the value of the element's first child node
  'w:p'            - appends two line endings (before the paragraph content)
  'w:br' / 'w:cr'  - appends one line ending
  'w:tab'          - appends a TAB character

  Every child that has children of its own is then processed recursively.
}
procedure ProcessNodes(var S: String; ANode: TDOMNode);
var
  Child: TDOMNode;
  Tag: DOMString;
begin
  Child:= ANode.FirstChild;
  while Assigned(Child) do
  begin
    Tag:= Child.NodeName;
    if Tag = 'w:t' then
    begin
      if Assigned(Child.FirstChild) then
        S:= S + Child.FirstChild.NodeValue;
    end
    else if Tag = 'w:p' then
      S:= S + LineEnding + LineEnding
    else if (Tag = 'w:br') or (Tag = 'w:cr') then
      S:= S + LineEnding
    else if Tag = 'w:tab' then
      S:= S + #9;
    // Descend regardless of the tag matched above
    if Assigned(Child.FirstChild) then
      ProcessNodes(S, Child);
    Child:= Child.NextSibling;
  end;
end;
{
  Reads the archive entry currently selected in ZipFile into MemoryStream.

  ZipFile      - opened archive with the wanted entry already located
  FileName     - not used by this routine (kept for the existing callers)
  MemoryStream - receives the uncompressed data; it is resized only after
                 the entry has been opened successfully

  Returns True only when the entry info could be read, the entry could be
  opened and exactly "uncompressed_size" bytes were read. In every other
  case the result is False, so the caller never parses a buffer that was
  allocated but not filled.
}
function ExtractFile(ZipFile: unzFile; FileName: PAnsiChar; MemoryStream: TMemoryStream): Boolean;
var
  ASize: LongInt;
  FileInfo: unz_file_info;
begin
  Result:= False;
  if unzGetCurrentFileInfo(ZipFile, @FileInfo, nil, 0, nil, 0, nil, 0) <> UNZ_OK then
    Exit;
  // A failed open must be reported as a failure
  if unzOpenCurrentFile(ZipFile) <> UNZ_OK then
    Exit;
  try
    MemoryStream.SetSize(FileInfo.uncompressed_size);
    ASize:= unzReadCurrentFile(ZipFile, MemoryStream.Memory, FileInfo.uncompressed_size);
    // A negative (error) or short read never equals the expected size
    Result:= (ASize = FileInfo.uncompressed_size);
  finally
    // Close the entry even if resizing the stream or reading raised
    unzCloseCurrentFile(ZipFile);
  end;
end;
{
  Locates the entry FileName inside ZipFile, parses it as XML and appends
  the text found by ProcessNodes to AText.

  Nothing happens when the entry does not exist, cannot be extracted or is
  empty. Exceptions raised while parsing or while walking the nodes are not
  handled here and propagate to the caller; the XML document and the
  temporary stream are released in either case.
}
procedure ProcessFile(ZipFile: unzFile; const FileName: String; var AText: String);
var
  ADoc: TXMLDocument;
  AStream: TMemoryStream;
begin
  if unzLocateFile(ZipFile, PAnsiChar(FileName), 0) <> UNZ_OK then
    Exit;

  AStream:= TMemoryStream.Create;
  try
    // An empty part contains no text, so there is nothing to parse
    if ExtractFile(ZipFile, PAnsiChar(FileName), AStream) and (AStream.Size > 0) then
    begin
      ADoc:= nil;
      try
        ReadXMLFile(ADoc, AStream, [xrfPreserveWhiteSpace]);
        if Assigned(ADoc) and Assigned(ADoc.DocumentElement) then
          ProcessNodes(AText, ADoc.DocumentElement);
      finally
        // ADoc may already be assigned when an exception is raised
        // (while walking the nodes, and presumably also by the parser),
        // so it is released here rather than on the success path only.
        ADoc.Free;
      end;
    end;
  finally
    AStream.Free;
  end;
end;
{
  Collects the text of an Office XML document (*.docx) into AText.

  The archive parts are visited in this order: word/header0..9.xml,
  word/document.xml, word/footer0..9.xml. Parts that are not present
  contribute nothing.

  Returns False when the archive cannot be opened or when no text was
  collected, True otherwise. AText is always reset to an empty string first.
}
function LoadFromOffice(const FileName: String; out AText: String): Boolean;
const
  HEADER_XML = 'word/header%d.xml';
  FOOTER_XML = 'word/footer%d.xml';
  LAST_PART  = 9;
var
  PartNo: Integer;
  Archive: unzFile;
begin
  AText:= EmptyStr;
  Archive:= unzOpen(PAnsiChar(FileName));
  if not Assigned(Archive) then
    Exit(False);

  try
    // Headers
    for PartNo:= 0 to LAST_PART do
      ProcessFile(Archive, Format(HEADER_XML, [PartNo]), AText);
    // Body
    ProcessFile(Archive, 'word/document.xml', AText);
    // Footers
    for PartNo:= 0 to LAST_PART do
      ProcessFile(Archive, Format(FOOTER_XML, [PartNo]), AText);

    Result:= Length(AText) > 0;
  finally
    unzClose(Archive);
  end;
end;
end.

View file

@ -231,10 +231,12 @@ begin
IsReplaceText:= AConfig.GetValue(ANode, 'IsReplaceText', False);
if IsReplaceText then
ReplaceText:= AConfig.GetValue(ANode, 'ReplaceText', '');
// text search options
HexValue:= AConfig.GetValue(ANode, 'HexValue', False);
CaseSensitive:= AConfig.GetValue(ANode, 'CaseSensitive', False);
NotContainingText:= AConfig.GetValue(ANode, 'NotContainingText', False);
TextRegExp:= AConfig.GetValue(ANode, 'TextRegExp', False);
OfficeXML:= AConfig.GetValue(ANode, 'OfficeXML', False);
TextEncoding:= AConfig.GetValue(ANode, 'TextEncoding', '');
if TextEncoding = 'UTF-8BOM' then TextEncoding:= 'UTF-8';
if TextEncoding = 'UCS-2LE' then TextEncoding:= 'UTF-16LE';
@ -336,11 +338,13 @@ begin
AConfig.AddValue(SubNode, 'IsReplaceText', IsReplaceText);
if IsReplaceText then
AConfig.AddValue(SubNode, 'ReplaceText', ReplaceText);
// text search options
AConfig.AddValue(SubNode, 'HexValue', HexValue);
AConfig.AddValue(SubNode, 'CaseSensitive', CaseSensitive);
AConfig.AddValue(SubNode, 'NotContainingText', NotContainingText);
AConfig.AddValue(SubNode, 'TextRegExp', TextRegExp);
AConfig.AddValue(SubNode, 'TextEncoding', TextEncoding);
AConfig.AddValue(SubNode, 'OfficeXML', OfficeXML);
// duplicates
Node := AConfig.AddNode(SubNode, 'Duplicates');
AConfig.SetAttr(Node, 'Enabled', Duplicates);