doublecmd/src/uofficexml.pas
2021-05-23 15:22:49 +03:00

618 lines
16 KiB
ObjectPascal

{
Double commander
-------------------------------------------------------------------------
Load text from office xml (*.docx, *.odt)
Copyright (C) 2021 Alexander Koblov (alexx2000@mail.ru)
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
}
unit uOfficeXML;
{$mode objfpc}{$H+}
interface
uses
Classes, SysUtils, uMasks;
var
OfficeMask: TMaskList;
const
OFFICE_FILTER = '(*.docx, *.xlsx, *.odt, *.ods)';
function LoadFromOffice(const FileName: String; out AText: String): Boolean;
implementation
uses
Math, Unzip, ZipUtils, Laz2_DOM, laz2_XMLRead, fpsNumFormat, fpsCommon, fgl;
type
TIntegerMap = class(specialize TFPGMap<Integer, TsNumFormatParams>);
function ExtractFile(ZipFile: unzFile; MemoryStream: TMemoryStream): Boolean;
var
ASize: LongInt;
FileInfo: unz_file_info;
begin
Result:= unzGetCurrentFileInfo(ZipFile, @FileInfo, nil, 0, nil, 0, nil, 0) = UNZ_OK;
if Result then
begin
MemoryStream.SetSize(FileInfo.uncompressed_size);
if unzOpenCurrentFile(ZipFile) = UNZ_OK then
begin
ASize:= unzReadCurrentFile(ZipFile, MemoryStream.Memory, FileInfo.uncompressed_size);
Result:= (ASize = FileInfo.uncompressed_size);
unzCloseCurrentFile(ZipFile);
end;
end;
end;
{ Office Open XML }
procedure ProcessOfficeOpenNodes(var S: String; ANode: TDOMNode);
var
I: Integer;
ASubNode: TDOMNode;
ANodeName: DOMString;
begin
for I:= 0 to ANode.ChildNodes.Count - 1 do
begin
ASubNode := ANode.ChildNodes.Item[I];
ANodeName := ASubNode.NodeName;
if (ANodeName = 'w:t') then
begin
if Assigned(ASubNode.FirstChild) then
S += ASubNode.FirstChild.NodeValue;
end
else if (ANodeName = 'w:p') then
S += LineEnding + LineEnding
else if (ANodeName = 'w:br') or (ANodeName = 'w:cr') then
S += LineEnding
else if (ANodeName = 'w:tab') then
S += #9;
if ASubNode.ChildNodes.Count > 0 then
ProcessOfficeOpenNodes(S, ASubNode);
end;
end;
procedure ProcessFile(ZipFile: unzFile; const FileName: String; var AText: String);
var
ADoc: TXMLDocument;
AStream: TMemoryStream;
begin
if unzLocateFile(ZipFile, PAnsiChar(FileName), 0) = UNZ_OK then
begin
AStream:= TMemoryStream.Create;
try
if ExtractFile(ZipFile, AStream) then
begin
ReadXMLFile(ADoc, AStream, [xrfPreserveWhiteSpace]);
if Assigned (ADoc) then
begin
ProcessOfficeOpenNodes(AText, ADoc.DocumentElement);
ADoc.Free;
end;
end;
finally
AStream.Free;
end;
end;
end;
function LoadFromOfficeOpen(const FileName: String; out AText: String): Boolean;
const
HEADER_XML = 'word/header%d.xml';
FOOTER_XML = 'word/footer%d.xml';
var
Index: Integer;
ZipFile: unzFile;
begin
AText:= EmptyStr;
ZipFile:= unzOpen(PAnsiChar(FileName));
Result:= Assigned(ZipFile);
if Result then
try
// Read headers
for Index:= 0 to 9 do
begin
ProcessFile(ZipFile, Format(HEADER_XML, [Index]), AText);
end;
// Read body
ProcessFile(ZipFile, 'word/document.xml', AText);
// Read footers
for Index:= 0 to 9 do
begin
ProcessFile(ZipFile, Format(FOOTER_XML, [Index]), AText);
end;
Result:= Length(AText) > 0;
finally
unzClose(ZipFile);
end;
end;
{ Open Document Format }
procedure ProcessOpenOfficeNodes(var S: String; ANode: TDOMNode);
var
I: Integer;
ASubNode: TDOMNode;
ANodeName: DOMString;
procedure ParseSubNode(ANode: TDOMNode);
var
J: Integer;
ASubNode: TDOMNode;
begin
for J:= 0 to ANode.ChildNodes.Count - 1 do
begin
ASubNode := ANode.ChildNodes.Item[J];
ANodeName := ASubNode.NodeName;
if ANodeName = 'text:s' then
S += ' '
else if ANodeName = 'text:tab' then
S += #9
else if ANodeName = 'text:line-break' then
S += LineEnding
else if (ASubNode.NodeType = TEXT_NODE) then
S += ASubNode.NodeValue
else begin
ParseSubNode(ASubNode);
end;
end;
end;
begin
for I:= 0 to ANode.ChildNodes.Count - 1 do
begin
ASubNode := ANode.ChildNodes.Item[I];
ANodeName := ASubNode.NodeName;
if (ANodeName = 'text:p') or (ANodeName = 'text:h')then
begin
if ASubNode.ChildNodes.Count > 0 then
begin
ParseSubNode(ASubNode);
S += LineEnding;
end;
end
else if ASubNode.ChildNodes.Count > 0 then
ProcessOpenOfficeNodes(S, ASubNode);
end;
end;
function LoadFromOpenOffice(const FileName: String; out AText: String): Boolean;
const
CONTENT_XML = 'content.xml';
var
ZipFile: unzFile;
ADoc: TXMLDocument;
AStream: TMemoryStream;
begin
Result:= False;
AText:= EmptyStr;
ZipFile:= unzOpen(PAnsiChar(FileName));
if Assigned(ZipFile) then
try
if unzLocateFile(ZipFile, CONTENT_XML, 0) = UNZ_OK then
begin
AStream:= TMemoryStream.Create;
try
if ExtractFile(ZipFile, AStream) then
begin
ReadXMLFile(ADoc, AStream, [xrfPreserveWhiteSpace]);
if Assigned (ADoc) then
begin
ProcessOpenOfficeNodes(AText, ADoc.DocumentElement);
ADoc.Free;
end;
end;
finally
AStream.Free;
end;
end;
Result:= Length(AText) > 0;
finally
unzClose(ZipFile);
end;
end;
{ Office Open XML Excel }
function FindNode(ANode: TDOMNode; const ANodeName: String): TDOMNode;
begin
Result:= ANode.FindNode(ANodeName);
if Result = nil then
Result:= ANode.FindNode('x:' + ANodeName);
end;
function ParseSubNode(ANode: TDOMNode): String;
var
ASubNode: TDOMNode;
begin
Result:= EmptyStr;
ASubNode:= ANode.FirstChild;
while Assigned(ASubNode) do
begin
if (ASubNode.NodeType = TEXT_NODE) then
Result+= ASubNode.NodeValue
else begin
Result+= ParseSubNode(ASubNode);
end;
ASubNode:= ASubNode.NextSibling;
end;
end;
function GetAttrValue(ANode: TDOMNode; AName: String): String;
begin
Result:= EmptyStr;
if (ANode = nil) or (ANode.Attributes = nil) then Exit;
ANode:= ANode.Attributes.GetNamedItem(AName);
if Assigned(ANode) then Result:= ANode.NodeValue;
end;
procedure ParseStyles(ZipFile: unzFile; Styles: TIntegerMap; Storage: TsNumFormatList);
const
STYLES_XML = 'xl/styles.xml';
var
AName: String;
Index: Integer;
Style: Integer;
ADoc: TXMLDocument;
Formats: TStringList;
AStream: TMemoryStream;
ANode, ASubNode, AFormat: TDOMNode;
begin
Formats:= TStringList.Create;
try
if unzLocateFile(ZipFile, STYLES_XML, 0) = UNZ_OK then
begin
AStream:= TMemoryStream.Create;
try
if ExtractFile(ZipFile, AStream) then
begin
ReadXMLFile(ADoc, AStream, [xrfPreserveWhiteSpace]);
if Assigned (ADoc) then
begin
AddBuiltInBiffFormats(Formats, FormatSettings, 163);
ANode:= ADoc.DocumentElement;
if Assigned(ANode) then
begin
ASubNode:= FindNode(ANode, 'numFmts');
if Assigned(ASubNode) then
begin
AFormat:= ASubNode.FirstChild;
while Assigned(AFormat) do
begin
AName:= AFormat.NodeName;
if (AName = 'numFmt') or (AName = 'x:numFmt') then
begin
AName:= GetAttrValue(AFormat, 'numFmtId');
if TryStrToInt(AName, Index) then
begin
while Formats.Count <= Index do
Formats.Add(EmptyStr);
Formats[Index]:= GetAttrValue(AFormat, 'formatCode');
end;
end;
AFormat:= AFormat.NextSibling;
end;
end;
ASubNode:= FindNode(ANode, 'cellXfs');
if Assigned(ASubNode) then
begin
Style:= 0;
AFormat:= ASubNode.FirstChild;
while Assigned(AFormat) do
begin
AName:= AFormat.NodeName;
if (AName = 'xf') or (AName = 'x:xf') then
begin
AName:= GetAttrValue(AFormat, 'numFmtId');
if TryStrToInt(AName, Index) then
begin
AName:= GetAttrValue(AFormat, 'applyNumberFormat');
if StrToBoolDef(AName, True) then
begin
if InRange(Index, 0, Formats.Count - 1) then
begin
AName:= Formats[Index];
if not SameText(AName, 'General') then
begin
Index:= Storage.AddFormat(AName);
Styles.Add(Style, Storage.Items[Index]);
end;
end;
end;
end;
Inc(Style);
end;
AFormat:= AFormat.NextSibling;
end;
end;
end;
ADoc.Free;
end;
end;
finally
AStream.Free;
end;
end;
finally
Formats.Free;
end;
end;
function ParseWorkbook(ZipFile: unzFile; Sheets: TStringList): Boolean;
const
CONTENT_XML = 'xl/workbook.xml';
var
AName: String;
ADoc: TXMLDocument;
AStream: TMemoryStream;
ANode, ASubNode: TDOMNode;
begin
if unzLocateFile(ZipFile, CONTENT_XML, 0) = UNZ_OK then
begin
AStream:= TMemoryStream.Create;
try
if ExtractFile(ZipFile, AStream) then
begin
ReadXMLFile(ADoc, AStream, [xrfPreserveWhiteSpace]);
if Assigned(ADoc) then
begin
ANode:= FindNode(ADoc.DocumentElement, 'sheets');
if Assigned(ANode) then
begin
ASubNode:= ANode.FirstChild;
while Assigned(ASubNode) do
begin
AName:= ASubNode.NodeName;
if (AName = 'sheet') or (AName = 'x:sheet') then
begin
AName:= GetAttrValue(ASubNode, 'name');
Sheets.Add(AName);
end;
ASubNode:= ASubNode.NextSibling;
end;
end;
ADoc.Free;
end;
end;
finally
AStream.Free;
end;
end;
Result:= (Sheets.Count > 0);
end;
procedure ParseSharedStrings(ZipFile: unzFile; Strings: TStringList);
const
STRINGS_XML = 'xl/sharedStrings.xml';
var
AName: String;
ADoc: TXMLDocument;
AStream: TMemoryStream;
ANode, ASubNode: TDOMNode;
begin
if unzLocateFile(ZipFile, STRINGS_XML, 0) = UNZ_OK then
begin
AStream:= TMemoryStream.Create;
try
if ExtractFile(ZipFile, AStream) then
begin
ReadXMLFile(ADoc, AStream, [xrfPreserveWhiteSpace]);
if Assigned (ADoc) then
begin
ANode:= ADoc.DocumentElement;
if Assigned(ANode) then
begin
ASubNode:= ANode.FirstChild;
while Assigned(ASubNode) do
begin
AName:= ASubNode.NodeName;
if (AName = 'si') or (AName = 'x:si') then
begin
Strings.Add(ParseSubNode(ASubNode));
end;
ASubNode:= ASubNode.NextSibling;
end;
end;
ADoc.Free;
end;
end;
finally
AStream.Free;
end;
end;
end;
procedure ParseCell(ACell: TDOMNode; Strings: TStringList; Styles: TIntegerMap; var Text: String);
var
D: Double;
K: Integer;
ATemp: String;
AType: String;
Index: Integer;
AStyle: String;
AValue: TDOMNode;
F: TsNumFormatParams;
Format: TFormatSettings;
begin
AType:= GetAttrValue(ACell, 't');
if (AType = 'inlineStr') then
AValue:= FindNode(ACell, 'is')
else begin
AValue:= FindNode(ACell, 'v');
end;
if Assigned(AValue) then
begin
ATemp:= ParseSubNode(AValue);
// Shared string
if AType = 's' then
begin
K:= StrToIntDef(ATemp, -1);
if InRange(K, 0, Strings.Count - 1) then
Text+= Strings[K];
end
// Inline string or formula
else if (AType = 'inlineStr') or (AType = 'str') then
begin
Text+= ATemp;
end
// Number or general
else if (AType = 'n') or (AType = '') then
begin
AStyle:= GetAttrValue(ACell, 's');
if not TryStrToInt(AStyle, K) then
Text+= ATemp
else begin
Index:= Styles.IndexOf(K);
if (Index < 0) then
Text+= ATemp
else begin
F:= Styles.Data[Index];
Format:= FormatSettings;
Format.DecimalSeparator:= '.';
if not TryStrToFloat(ATemp, D, Format) then
Text+= ATemp
else
Text+= ConvertFloatToStr(D, F, FormatSettings);
end;
end;
end;
end;
end;
procedure ParseSheet(ZipFile: unzFile; Sheet: Integer; Strings: TStringList; Styles: TIntegerMap; var Text: String);
const
SHEET_XML = 'xl/worksheets/sheet%d.xml';
var
AName: String;
ADoc: TXMLDocument;
AStream: TMemoryStream;
ANode, ARow, ACell: TDOMNode;
begin
AName:= Format(SHEET_XML, [Sheet]);
if unzLocateFile(ZipFile, PAnsiChar(AName), 0) = UNZ_OK then
begin
AStream:= TMemoryStream.Create;
try
if ExtractFile(ZipFile, AStream) then
begin
ReadXMLFile(ADoc, AStream, [xrfPreserveWhiteSpace]);
if Assigned(ADoc) then
begin
ANode:= FindNode(ADoc.DocumentElement, 'sheetData');
if Assigned(ANode) then
begin
ARow:= ANode.FirstChild;
while Assigned(ARow) do
begin
AName:= ARow.NodeName;
if (AName = 'row') or (AName = 'x:row') then
begin
ACell:= ARow.FirstChild;
while Assigned(ACell) do
begin
AName:= ACell.NodeName;
if (AName = 'c') or (AName = 'x:c') then
begin
Text+= #26;
ParseCell(ACell, Strings, Styles, Text);
end;
ACell:= ACell.NextSibling;
end;
Text+= LineEnding;
end;
ARow:= ARow.NextSibling;
end;
end;
ADoc.Free;
end;
end;
finally
AStream.Free;
end;
end;
end;
function LoadFromExcel(const FileName: String; out AText: String): Boolean;
var
Index: Integer;
ZipFile: unzFile;
Styles: TIntegerMap;
Storage: TsNumFormatList;
Sheets, Strings: TStringList;
begin
Result:= False;
Sheets:= TStringList.Create;
Styles:= TIntegerMap.Create;
Strings:= TStringList.Create;
Storage:= TsNumFormatList.Create(FormatSettings, True);
try
ZipFile:= unzOpen(PAnsiChar(FileName));
if Assigned(ZipFile) then
try
if ParseWorkbook(ZipFile, Sheets) then
begin
AText:= EmptyStr;
ParseSharedStrings(ZipFile, Strings);
ParseStyles(ZipFile, Styles, Storage);
for Index:= 0 to Sheets.Count - 1 do
begin
AText+= Sheets[Index] + LineEnding;
ParseSheet(ZipFile, Index + 1, Strings, Styles, AText);
end;
Result:= Length(AText) > 0;
end;
finally
unzClose(ZipFile);
end;
finally
Sheets.Free;
Styles.Free;
Strings.Free;
Storage.Free;
end;
end;
function LoadFromOffice(const FileName: String; out AText: String): Boolean;
begin
if SameText(ExtractFileExt(FileName), '.docx') then
Result:= LoadFromOfficeOpen(FileName, AText)
else if SameText(ExtractFileExt(FileName), '.xlsx') then
Result:= LoadFromExcel(FileName, AText)
else
Result:= LoadFromOpenOffice(FileName, AText);
end;
initialization
OfficeMask:= TMaskList.Create('*.docx;*.xlsx;*.odt;*.ods');
finalization
OfficeMask.Free;
end.