mirror of
https://github.com/doublecmd/doublecmd.git
synced 2026-06-21 09:58:13 +00:00
UPD: Charset Detector
This commit is contained in:
parent
135cbffab4
commit
877ea0897a
26 changed files with 539 additions and 1246 deletions
|
|
@ -1,13 +1,13 @@
|
|||
-----------Summary
|
||||
Charset Detector - as the name says - is a stand alone executable module for automatic charset detection of a given text.
|
||||
Charset Detector - as the name says - is a stand alone component for automatic charset detection of a given text.
|
||||
It can be useful for internationalisation support in multilingual applications such as web-script editors or Unicode editors.
|
||||
The given input buffer is analysed to guess the encoding used. The result can be used as a control parameter for a charset conversion procedure.
|
||||
Charset Detector can be compiled (and hopefully used) for MS Windows (as dll - dynamic link library) or Linux.
|
||||
Based on Mozilla's i18n component - http://www.mozilla.org/projects/intl/.
|
||||
Based on Mozilla's i18n component - https://dxr.mozilla.org/mozilla/source/extensions/universalchardet/.
|
||||
|
||||
-----------State
|
||||
Version 0.2.6 stable.
|
||||
The latest version can be found at http://chsdet.sourceforge.net.
|
||||
Version 0.2.9 stable.
|
||||
The original version can be found at http://chsdet.sourceforge.net.
|
||||
https://sourceforge.net/p/doublecmd/code/HEAD/tree/trunk/components/chsdet/.
|
||||
|
||||
-----------Requirements
|
||||
Charset Detector doesn't need any external components.
|
||||
|
|
@ -16,8 +16,8 @@ Charset Detector doesn't need any external components.
|
|||
As result you will get guessed charset as MS Windows Code Page id and charset name.
|
||||
|
||||
-----------Licence
|
||||
Charset Detector is open source project and distributed under Lesser GPL.
|
||||
See the GNU Lesser General Public License for more details - http://www.opensource.org/licenses/lgpl-license.php
|
||||
Charset Detector is an open source project distributed under the GNU LGPL.
|
||||
See the GNU Lesser General Public License for more details - https://opensource.org/licenses/LGPL-2.1
|
||||
|
||||
-----------Supported charsets
|
||||
|
||||
|
|
@ -37,11 +37,9 @@ See the GNU Lesser General Public License for more details - http://www.opensour
|
|||
| 1255 | windows-1255 | |
|
||||
| 10007 | x-mac-cyrillic | |
|
||||
| 12000 | X-ISO-10646-UCS-4-2143 | |
|
||||
| 12000 | UTF-32LE | MS Windows hasn't CP.|
|
||||
| | | Try to use UCS-4. |
|
||||
| 12000 | UTF-32LE | |
|
||||
| 12001 | X-ISO-10646-UCS-4-3412 | |
|
||||
| 12001 | UTF-32BE | MS Windows hasn't CP.|
|
||||
| | | Try to use UCS-4. |
|
||||
| 12001 | UTF-32BE | |
|
||||
| 20866 | KOI8-R | |
|
||||
| 28595 | ISO-8859-5 | |
|
||||
| 28595 | ISO-8859-5 | |
|
||||
|
|
@ -57,7 +55,7 @@ See the GNU Lesser General Public License for more details - http://www.opensour
|
|||
| 54936 | GB18030 | |
|
||||
| 65001 | UTF-8 | |
|
||||
+-----------+---------------------------+------------------------+
|
||||
|
||||
|
||||
-----------Types
|
||||
Return values
|
||||
|
||||
|
|
@ -67,76 +65,29 @@ Return values
|
|||
Returned types
|
||||
|
||||
rCharsetInfo = record
|
||||
Name: pChar; // charset GNU canonical name
|
||||
CodePage: integer; // MS Windows CodePage id
|
||||
Language: pChar; //
|
||||
Name: PAnsiChar; // Charset GNU canonical name
|
||||
CodePage: Integer; // MS Windows CodePage ID
|
||||
Language: PAnsiChar;
|
||||
end;
|
||||
|
||||
rAboutHolder = record
|
||||
MajorVersionNr: Cardinal; // Library's Major Version #
|
||||
MinorVersionNr: Cardinal; // Library's Minor Version #
|
||||
BuildVersionNr: Cardinal; // Library's Build/Release Version #
|
||||
About: pChar; // Copyleft information;
|
||||
-----------Usage sample
|
||||
|
||||
Below is a small usage sample in Free Pascal.
|
||||
|
||||
function DetectEncoding(const S: String): rCharsetInfo;
|
||||
var
|
||||
Detector: TnsUniversalDetector;
|
||||
begin
|
||||
Detector:= TnsUniversalDetector.Create;
|
||||
try
|
||||
Detector.Reset;
|
||||
Detector.HandleData(PAnsiChar(S), Length(S));
|
||||
if not Detector.Done then Detector.DataEnd;
|
||||
Result:= Detector.GetDetectedCharsetInfo;
|
||||
finally
|
||||
FreeAndNil(Detector);
|
||||
end;
|
||||
end;
|
||||
|
||||
-----------Exported functions
|
||||
procedure chsd_Reset; stdcall;
|
||||
Reset the Charset Detector state and prepare for a new analysis.
|
||||
|
||||
function chsd_HandleData(aBuf: PChar; aLen: integer): integer; stdcall;
|
||||
Analyse given buffer.
|
||||
Parameters
|
||||
aBuf - pointer to buffer with text.
|
||||
aLen - buffer length.
|
||||
Return value
|
||||
NS_ERROR_OUT_OF_MEMORY - failure. Unable to create internal objects.
|
||||
NS_OK - success.
|
||||
Note
|
||||
The function can be called more than once to continue guessing. Charset Detector
|
||||
remembers the last state until chsd_Reset is called.
|
||||
|
||||
function chsd_Done: Boolean; stdcall;
|
||||
Return value
|
||||
TRUE - Charset Detector is sure about text encoding.
|
||||
FALSE - Otherwise.
|
||||
Note
|
||||
If the input buffer is smaller than 1K, Charset Detector returns FALSE anyway.
|
||||
|
||||
procedure chsd_DataEnd; stdcall;
|
||||
Signal the end of data. If Charset Detector has no sure result (Done = FALSE)
|
||||
the best guessed encoding will be set as result.
|
||||
|
||||
function chsd_GetDetectedCharset: rCharsetInfo; stdcall;
|
||||
Returns guessed charset.
|
||||
|
||||
procedure chsd_GetKnownCharsets(var KnownCharsets: pChar);
|
||||
Fills the parameter with all supported charsets in form
|
||||
"CodePage - Name LineFeed".
|
||||
|
||||
procedure chsd_GetAbout(var About: rAboutHolder); stdcall;
|
||||
Fills the parameter with version and copyleft information.
|
||||
|
||||
-----------Sample
|
||||
The definition file "chsd_dll_intf.pas" can be found in the same directory.
|
||||
Below is a small usage sample.
|
||||
|
||||
// WS: WideString; // Wide string which can be used in Unicode controls.
|
||||
|
||||
// Get encoding of some buffer
|
||||
chsd_Reset;
|
||||
chsd_HandleData(aBuf, aLen);
|
||||
|
||||
if not chsd_Done then
|
||||
chsd_DataEnd;
|
||||
|
||||
ChSInfo := chsd_GetDetectedCharset();
|
||||
|
||||
// convert buffer to WideString
|
||||
OutputLength := MultiByteToWideChar(ChSInfo.CodePage, 0, aBuf, aLen, nil, 0);
|
||||
SetLength(WS, OutputLength);
|
||||
MultiByteToWideChar(ChSInfo.CodePage, 0, aBuf, aLen, PWideChar(WS), OutputLength);
|
||||
|
||||
// If you are using Unicode SynEdit
|
||||
SynEdit.Lines.Text := WS;
|
||||
|
||||
Nikolaj Yakowlew © 2006-2008
|
||||
Copyright (C) 2006-2013 Nikolaj Yakowlew
|
||||
Copyright (C) 2011-2019 Alexander Koblov
|
||||
|
|
|
|||
|
|
@ -1,70 +0,0 @@
|
|||
// +----------------------------------------------------------------------+
|
||||
// | chsdet - Charset Detector Library |
|
||||
// +----------------------------------------------------------------------+
|
||||
// | Copyright (C) 2006, Nick Yakowlew http://chsdet.sourceforge.net |
|
||||
// +----------------------------------------------------------------------+
|
||||
// | Based on Mozilla sources http://www.mozilla.org/projects/intl/ |
|
||||
// +----------------------------------------------------------------------+
|
||||
// | This library is free software; you can redistribute it and/or modify |
|
||||
// | it under the terms of the GNU General Public License as published by |
|
||||
// | the Free Software Foundation; either version 2 of the License, or |
|
||||
// | (at your option) any later version. |
|
||||
// | This library is distributed in the hope that it will be useful |
|
||||
// | but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
||||
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
|
||||
// | See the GNU Lesser General Public License for more details. |
|
||||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: chsd_dll_intf.pas,v 1.4 2009/07/12 15:13:56 ya_nick Exp $
|
||||
|
||||
unit chsd_dll_intf;
|
||||
|
||||
interface
|
||||
|
||||
const
|
||||
NS_OK = 0;
|
||||
NS_ERROR_OUT_OF_MEMORY = $8007000e;
|
||||
|
||||
type
|
||||
rCharsetInfo = record
|
||||
Name: pChar;
|
||||
CodePage: integer;
|
||||
Language: pChar;
|
||||
end;
|
||||
prCharsetInfo = ^rCharsetInfo;
|
||||
|
||||
rAboutHolder = record
|
||||
MajorVersionNr: Cardinal;
|
||||
MinorVersionNr: Cardinal;
|
||||
BuildVersionNr: Cardinal;
|
||||
About: pChar;
|
||||
end;
|
||||
|
||||
eBOMKind =(
|
||||
BOM_Not_Found,
|
||||
BOM_UCS4_BE, // 00 00 FE FF UCS-4, big-endian machine (1234 order)
|
||||
BOM_UCS4_LE, // FF FE 00 00 UCS-4, little-endian machine (4321 order)
|
||||
BOM_UCS4_2143, // 00 00 FF FE UCS-4, unusual octet order (2143)
|
||||
BOM_UCS4_3412, // FE FF 00 00 UCS-4, unusual octet order (3412)
|
||||
BOM_UTF16_BE, // FE FF ## ## UTF-16, big-endian
|
||||
BOM_UTF16_LE, // FF FE ## ## UTF-16, little-endian
|
||||
BOM_UTF8 // EF BB BF UTF-8
|
||||
);
|
||||
|
||||
const
|
||||
CharsetDetectorLibrary = 'chsdet.dll';
|
||||
|
||||
procedure csd_Reset; stdcall; external CharsetDetectorLibrary;
|
||||
function csd_HandleData(aBuf: PChar; aLen: integer): integer; stdcall; external CharsetDetectorLibrary;
|
||||
function csd_Done: Boolean; stdcall; external CharsetDetectorLibrary;
|
||||
procedure csd_DataEnd; stdcall; external CharsetDetectorLibrary;
|
||||
function csd_GetDetectedCharset: rCharsetInfo; stdcall; external CharsetDetectorLibrary;
|
||||
procedure csd_GetKnownCharsets(var KnownCharsets: pChar); stdcall; external CharsetDetectorLibrary;
|
||||
procedure csd_GetAbout(var About: rAboutHolder); stdcall; external CharsetDetectorLibrary;
|
||||
function csd_GetDetectedBOM: eBOMKind; stdcall; external CharsetDetectorLibrary;
|
||||
procedure csd_DisableCharsetCP(CodePage: integer); stdcall; external CharsetDetectorLibrary;
|
||||
|
||||
implementation
|
||||
|
||||
end.
|
||||
|
|
@ -1,227 +1,225 @@
|
|||
<?xml version="1.0"?>
|
||||
<CONFIG>
|
||||
<Package Version="4">
|
||||
<PathDelim Value="\"/>
|
||||
<Name Value="chsdet"/>
|
||||
<CompilerOptions>
|
||||
<Version Value="11"/>
|
||||
<PathDelim Value="\"/>
|
||||
<SearchPaths>
|
||||
<OtherUnitFiles Value="src;src\sbseq"/>
|
||||
<UnitOutputDirectory Value="lib\$(TargetCPU)-$(TargetOS)"/>
|
||||
</SearchPaths>
|
||||
<CodeGeneration>
|
||||
<Checks>
|
||||
<RangeChecks Value="True"/>
|
||||
<OverflowChecks Value="True"/>
|
||||
</Checks>
|
||||
</CodeGeneration>
|
||||
<Linking>
|
||||
<Debugging>
|
||||
<DebugInfoType Value="dsDwarf2Set"/>
|
||||
</Debugging>
|
||||
</Linking>
|
||||
<Other>
|
||||
<CompilerPath Value="$(CompPath)"/>
|
||||
</Other>
|
||||
</CompilerOptions>
|
||||
<Files Count="46">
|
||||
<Item1>
|
||||
<Filename Value="src\Big5Freq.pas"/>
|
||||
<UnitName Value="Big5Freq"/>
|
||||
</Item1>
|
||||
<Item2>
|
||||
<Filename Value="src\CharDistribution.pas"/>
|
||||
<UnitName Value="CharDistribution"/>
|
||||
</Item2>
|
||||
<Item3>
|
||||
<Filename Value="src\chsdIntf.pas"/>
|
||||
<UnitName Value="chsdIntf"/>
|
||||
</Item3>
|
||||
<Item4>
|
||||
<Filename Value="src\CustomDetector.pas"/>
|
||||
<UnitName Value="CustomDetector"/>
|
||||
</Item4>
|
||||
<Item5>
|
||||
<Filename Value="src\dbg.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item5>
|
||||
<Item6>
|
||||
<Filename Value="src\EUCKRFreq.pas"/>
|
||||
<UnitName Value="EUCKRFreq"/>
|
||||
</Item6>
|
||||
<Item7>
|
||||
<Filename Value="src\EUCSampler.pas"/>
|
||||
<UnitName Value="EUCSampler"/>
|
||||
</Item7>
|
||||
<Item8>
|
||||
<Filename Value="src\EUCTWFreq.pas"/>
|
||||
<UnitName Value="EUCTWFreq"/>
|
||||
</Item8>
|
||||
<Item9>
|
||||
<Filename Value="src\GB2312Freq.pas"/>
|
||||
<UnitName Value="GB2312Freq"/>
|
||||
</Item9>
|
||||
<Item10>
|
||||
<Filename Value="src\JISFreq.pas"/>
|
||||
<UnitName Value="JISFreq"/>
|
||||
</Item10>
|
||||
<Item11>
|
||||
<Filename Value="src\JpCntx.pas"/>
|
||||
<UnitName Value="JpCntx"/>
|
||||
</Item11>
|
||||
<Item12>
|
||||
<Filename Value="src\mbclass\Big5LangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item12>
|
||||
<Item13>
|
||||
<Filename Value="src\mbclass\EUCJPLangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item13>
|
||||
<Item14>
|
||||
<Filename Value="src\mbclass\EUCKRLangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item14>
|
||||
<Item15>
|
||||
<Filename Value="src\mbclass\EUCTWLangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item15>
|
||||
<Item16>
|
||||
<Filename Value="src\mbclass\GB18030LangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item16>
|
||||
<Item17>
|
||||
<Filename Value="src\mbclass\HZLangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item17>
|
||||
<Item18>
|
||||
<Filename Value="src\mbclass\ISO2022CNLangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item18>
|
||||
<Item19>
|
||||
<Filename Value="src\mbclass\ISO2022JPLangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item19>
|
||||
<Item20>
|
||||
<Filename Value="src\mbclass\ISO2022KRLangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item20>
|
||||
<Item21>
|
||||
<Filename Value="src\mbclass\SJISLangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item21>
|
||||
<Item22>
|
||||
<Filename Value="src\mbclass\UCS2BELangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item22>
|
||||
<Item23>
|
||||
<Filename Value="src\mbclass\UCS2LELangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item23>
|
||||
<Item24>
|
||||
<Filename Value="src\mbclass\UTF8LangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item24>
|
||||
<Item25>
|
||||
<Filename Value="src\MBUnicodeMultiProber.pas"/>
|
||||
<UnitName Value="MBUnicodeMultiProber"/>
|
||||
</Item25>
|
||||
<Item26>
|
||||
<Filename Value="src\MultiModelProber.pas"/>
|
||||
<UnitName Value="MultiModelProber"/>
|
||||
</Item26>
|
||||
<Item27>
|
||||
<Filename Value="src\nsCodingStateMachine.pas"/>
|
||||
<UnitName Value="nsCodingStateMachine"/>
|
||||
</Item27>
|
||||
<Item28>
|
||||
<Filename Value="src\nsCore.pas"/>
|
||||
<UnitName Value="nsCore"/>
|
||||
</Item28>
|
||||
<Item29>
|
||||
<Filename Value="src\nsEscCharsetProber.pas"/>
|
||||
<UnitName Value="nsEscCharsetProber"/>
|
||||
</Item29>
|
||||
<Item30>
|
||||
<Filename Value="src\nsGroupProber.pas"/>
|
||||
<UnitName Value="nsGroupProber"/>
|
||||
</Item30>
|
||||
<Item31>
|
||||
<Filename Value="src\nsHebrewProber.pas"/>
|
||||
<UnitName Value="nsHebrewProber"/>
|
||||
</Item31>
|
||||
<Item32>
|
||||
<Filename Value="src\nsLatin1Prober.pas"/>
|
||||
<UnitName Value="nsLatin1Prober"/>
|
||||
</Item32>
|
||||
<Item33>
|
||||
<Filename Value="src\nsMBCSMultiProber.pas"/>
|
||||
<UnitName Value="nsMBCSMultiProber"/>
|
||||
</Item33>
|
||||
<Item34>
|
||||
<Filename Value="src\nsPkg.pas"/>
|
||||
<UnitName Value="nsPkg"/>
|
||||
</Item34>
|
||||
<Item35>
|
||||
<Filename Value="src\nsSBCharSetProber.pas"/>
|
||||
<UnitName Value="nsSBCharSetProber"/>
|
||||
</Item35>
|
||||
<Item36>
|
||||
<Filename Value="src\nsSBCSGroupProber.pas"/>
|
||||
<UnitName Value="nsSBCSGroupProber"/>
|
||||
</Item36>
|
||||
<Item37>
|
||||
<Filename Value="src\nsUniversalDetector.pas"/>
|
||||
<UnitName Value="nsUniversalDetector"/>
|
||||
</Item37>
|
||||
<Item38>
|
||||
<Filename Value="src\sbseq\LangBulgarianModel.pas"/>
|
||||
<UnitName Value="LangBulgarianModel"/>
|
||||
</Item38>
|
||||
<Item39>
|
||||
<Filename Value="src\sbseq\LangCyrillicModel.pas"/>
|
||||
<UnitName Value="LangCyrillicModel"/>
|
||||
</Item39>
|
||||
<Item40>
|
||||
<Filename Value="src\sbseq\LangGreekModel.pas"/>
|
||||
<UnitName Value="LangGreekModel"/>
|
||||
</Item40>
|
||||
<Item41>
|
||||
<Filename Value="src\sbseq\LangHebrewModel.pas"/>
|
||||
<UnitName Value="LangHebrewModel"/>
|
||||
</Item41>
|
||||
<Item42>
|
||||
<Filename Value="src\stat\Big5Statistics.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item42>
|
||||
<Item43>
|
||||
<Filename Value="src\stat\EUCJPStatistics.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item43>
|
||||
<Item44>
|
||||
<Filename Value="src\stat\EUCKRStatistics.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item44>
|
||||
<Item45>
|
||||
<Filename Value="src\stat\EUCTWStatistics.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item45>
|
||||
<Item46>
|
||||
<Filename Value="src\stat\GB2312Statistics.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item46>
|
||||
</Files>
|
||||
<RequiredPkgs Count="1">
|
||||
<Item1>
|
||||
<PackageName Value="FCL"/>
|
||||
<MinVersion Major="1" Valid="True"/>
|
||||
</Item1>
|
||||
</RequiredPkgs>
|
||||
<UsageOptions>
|
||||
<UnitPath Value="$(PkgOutDir)"/>
|
||||
</UsageOptions>
|
||||
<PublishOptions>
|
||||
<Version Value="2"/>
|
||||
</PublishOptions>
|
||||
</Package>
|
||||
</CONFIG>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<CONFIG>
|
||||
<Package Version="4">
|
||||
<PathDelim Value="\"/>
|
||||
<Name Value="chsdet"/>
|
||||
<Author Value="Nikolaj Yakowlew, Alexander Koblov"/>
|
||||
<CompilerOptions>
|
||||
<Version Value="11"/>
|
||||
<PathDelim Value="\"/>
|
||||
<SearchPaths>
|
||||
<OtherUnitFiles Value="src;src\sbseq"/>
|
||||
<UnitOutputDirectory Value="lib\$(TargetCPU)-$(TargetOS)"/>
|
||||
</SearchPaths>
|
||||
<CodeGeneration>
|
||||
<Checks>
|
||||
<RangeChecks Value="True"/>
|
||||
<OverflowChecks Value="True"/>
|
||||
</Checks>
|
||||
</CodeGeneration>
|
||||
<Linking>
|
||||
<Debugging>
|
||||
<DebugInfoType Value="dsDwarf2Set"/>
|
||||
</Debugging>
|
||||
</Linking>
|
||||
</CompilerOptions>
|
||||
<Description Value="Charset Detector - as the name says - is a stand alone component for automatic charset detection of a given text.
|
||||
Given input buffer will be analysed to guess used encoding. The result can be used as control parameter for charset conversation procedure."/>
|
||||
<License Value="GNU LGPL-2.1"/>
|
||||
<Version Minor="2" Release="9"/>
|
||||
<Files Count="45">
|
||||
<Item1>
|
||||
<Filename Value="src\Big5Freq.pas"/>
|
||||
<UnitName Value="Big5Freq"/>
|
||||
</Item1>
|
||||
<Item2>
|
||||
<Filename Value="src\CharDistribution.pas"/>
|
||||
<UnitName Value="CharDistribution"/>
|
||||
</Item2>
|
||||
<Item3>
|
||||
<Filename Value="src\CustomDetector.pas"/>
|
||||
<UnitName Value="CustomDetector"/>
|
||||
</Item3>
|
||||
<Item4>
|
||||
<Filename Value="src\dbg.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item4>
|
||||
<Item5>
|
||||
<Filename Value="src\EUCKRFreq.pas"/>
|
||||
<UnitName Value="EUCKRFreq"/>
|
||||
</Item5>
|
||||
<Item6>
|
||||
<Filename Value="src\EUCSampler.pas"/>
|
||||
<UnitName Value="EUCSampler"/>
|
||||
</Item6>
|
||||
<Item7>
|
||||
<Filename Value="src\EUCTWFreq.pas"/>
|
||||
<UnitName Value="EUCTWFreq"/>
|
||||
</Item7>
|
||||
<Item8>
|
||||
<Filename Value="src\GB2312Freq.pas"/>
|
||||
<UnitName Value="GB2312Freq"/>
|
||||
</Item8>
|
||||
<Item9>
|
||||
<Filename Value="src\JISFreq.pas"/>
|
||||
<UnitName Value="JISFreq"/>
|
||||
</Item9>
|
||||
<Item10>
|
||||
<Filename Value="src\JpCntx.pas"/>
|
||||
<UnitName Value="JpCntx"/>
|
||||
</Item10>
|
||||
<Item11>
|
||||
<Filename Value="src\mbclass\Big5LangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item11>
|
||||
<Item12>
|
||||
<Filename Value="src\mbclass\EUCJPLangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item12>
|
||||
<Item13>
|
||||
<Filename Value="src\mbclass\EUCKRLangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item13>
|
||||
<Item14>
|
||||
<Filename Value="src\mbclass\EUCTWLangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item14>
|
||||
<Item15>
|
||||
<Filename Value="src\mbclass\GB18030LangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item15>
|
||||
<Item16>
|
||||
<Filename Value="src\mbclass\HZLangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item16>
|
||||
<Item17>
|
||||
<Filename Value="src\mbclass\ISO2022CNLangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item17>
|
||||
<Item18>
|
||||
<Filename Value="src\mbclass\ISO2022JPLangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item18>
|
||||
<Item19>
|
||||
<Filename Value="src\mbclass\ISO2022KRLangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item19>
|
||||
<Item20>
|
||||
<Filename Value="src\mbclass\SJISLangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item20>
|
||||
<Item21>
|
||||
<Filename Value="src\mbclass\UCS2BELangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item21>
|
||||
<Item22>
|
||||
<Filename Value="src\mbclass\UCS2LELangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item22>
|
||||
<Item23>
|
||||
<Filename Value="src\mbclass\UTF8LangModel.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item23>
|
||||
<Item24>
|
||||
<Filename Value="src\MBUnicodeMultiProber.pas"/>
|
||||
<UnitName Value="MBUnicodeMultiProber"/>
|
||||
</Item24>
|
||||
<Item25>
|
||||
<Filename Value="src\MultiModelProber.pas"/>
|
||||
<UnitName Value="MultiModelProber"/>
|
||||
</Item25>
|
||||
<Item26>
|
||||
<Filename Value="src\nsCodingStateMachine.pas"/>
|
||||
<UnitName Value="nsCodingStateMachine"/>
|
||||
</Item26>
|
||||
<Item27>
|
||||
<Filename Value="src\nsCore.pas"/>
|
||||
<UnitName Value="nsCore"/>
|
||||
</Item27>
|
||||
<Item28>
|
||||
<Filename Value="src\nsEscCharsetProber.pas"/>
|
||||
<UnitName Value="nsEscCharsetProber"/>
|
||||
</Item28>
|
||||
<Item29>
|
||||
<Filename Value="src\nsGroupProber.pas"/>
|
||||
<UnitName Value="nsGroupProber"/>
|
||||
</Item29>
|
||||
<Item30>
|
||||
<Filename Value="src\nsHebrewProber.pas"/>
|
||||
<UnitName Value="nsHebrewProber"/>
|
||||
</Item30>
|
||||
<Item31>
|
||||
<Filename Value="src\nsLatin1Prober.pas"/>
|
||||
<UnitName Value="nsLatin1Prober"/>
|
||||
</Item31>
|
||||
<Item32>
|
||||
<Filename Value="src\nsMBCSMultiProber.pas"/>
|
||||
<UnitName Value="nsMBCSMultiProber"/>
|
||||
</Item32>
|
||||
<Item33>
|
||||
<Filename Value="src\nsPkg.pas"/>
|
||||
<UnitName Value="nsPkg"/>
|
||||
</Item33>
|
||||
<Item34>
|
||||
<Filename Value="src\nsSBCharSetProber.pas"/>
|
||||
<UnitName Value="nsSBCharSetProber"/>
|
||||
</Item34>
|
||||
<Item35>
|
||||
<Filename Value="src\nsSBCSGroupProber.pas"/>
|
||||
<UnitName Value="nsSBCSGroupProber"/>
|
||||
</Item35>
|
||||
<Item36>
|
||||
<Filename Value="src\nsUniversalDetector.pas"/>
|
||||
<UnitName Value="nsUniversalDetector"/>
|
||||
</Item36>
|
||||
<Item37>
|
||||
<Filename Value="src\sbseq\LangBulgarianModel.pas"/>
|
||||
<UnitName Value="LangBulgarianModel"/>
|
||||
</Item37>
|
||||
<Item38>
|
||||
<Filename Value="src\sbseq\LangCyrillicModel.pas"/>
|
||||
<UnitName Value="LangCyrillicModel"/>
|
||||
</Item38>
|
||||
<Item39>
|
||||
<Filename Value="src\sbseq\LangGreekModel.pas"/>
|
||||
<UnitName Value="LangGreekModel"/>
|
||||
</Item39>
|
||||
<Item40>
|
||||
<Filename Value="src\sbseq\LangHebrewModel.pas"/>
|
||||
<UnitName Value="LangHebrewModel"/>
|
||||
</Item40>
|
||||
<Item41>
|
||||
<Filename Value="src\stat\Big5Statistics.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item41>
|
||||
<Item42>
|
||||
<Filename Value="src\stat\EUCJPStatistics.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item42>
|
||||
<Item43>
|
||||
<Filename Value="src\stat\EUCKRStatistics.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item43>
|
||||
<Item44>
|
||||
<Filename Value="src\stat\EUCTWStatistics.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item44>
|
||||
<Item45>
|
||||
<Filename Value="src\stat\GB2312Statistics.inc"/>
|
||||
<Type Value="Include"/>
|
||||
</Item45>
|
||||
</Files>
|
||||
<RequiredPkgs Count="1">
|
||||
<Item1>
|
||||
<PackageName Value="FCL"/>
|
||||
<MinVersion Major="1" Valid="True"/>
|
||||
</Item1>
|
||||
</RequiredPkgs>
|
||||
<UsageOptions>
|
||||
<UnitPath Value="$(PkgOutDir)"/>
|
||||
</UsageOptions>
|
||||
<PublishOptions>
|
||||
<Version Value="2"/>
|
||||
</PublishOptions>
|
||||
</Package>
|
||||
</CONFIG>
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ unit chsdet;
|
|||
interface
|
||||
|
||||
uses
|
||||
Big5Freq, CharDistribution, chsdIntf, CustomDetector, EUCKRFreq, EUCSampler,
|
||||
Big5Freq, CharDistribution, CustomDetector, EUCKRFreq, EUCSampler,
|
||||
EUCTWFreq, GB2312Freq, JISFreq, JpCntx, MBUnicodeMultiProber,
|
||||
MultiModelProber, nsCodingStateMachine, nsCore, nsEscCharsetProber,
|
||||
nsGroupProber, nsHebrewProber, nsLatin1Prober, nsMBCSMultiProber, nsPkg,
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@
|
|||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: Big5Freq.pas,v 1.2 2007/05/20 15:46:02 ya_nick Exp $
|
||||
// $Id: Big5Freq.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
|
||||
|
||||
unit Big5Freq;
|
||||
|
||||
|
|
@ -49,7 +49,7 @@ const
|
|||
//Char to FreqOrder table ,
|
||||
BIG5_TABLE_SIZE = 5376;
|
||||
|
||||
Big5CharToFreqOrder: array [0..BIG5_TABLE_SIZE-1] of PRInt16 =
|
||||
Big5CharToFreqOrder: array [0..BIG5_TABLE_SIZE-1] of int16 =
|
||||
(
|
||||
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, // 16
|
||||
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, // 32
|
||||
|
|
@ -933,4 +933,4 @@ const
|
|||
****************************************************************************************)
|
||||
);
|
||||
implementation
|
||||
end.
|
||||
end.
|
||||
|
|
@ -16,7 +16,7 @@
|
|||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: CharDistribution.pas,v 1.3 2007/05/26 13:09:38 ya_nick Exp $
|
||||
// $Id: CharDistribution.pas,v 1.4 2013/04/23 19:47:10 ya_nick Exp $
|
||||
|
||||
unit CharDistribution;
|
||||
|
||||
|
|
@ -32,13 +32,13 @@ type
|
|||
protected
|
||||
//mDone: PRBool; (*If this flag is set to PR_TRUE, detection is done and conclusion has been made*)
|
||||
// YaN: nice idea. Unfortunately is not implemented :((
|
||||
mFreqChars: PRUint32; (*The number of characters whose frequency order is less than 512*)
|
||||
mTotalChars: PRUint32; (*Total character encounted.*)
|
||||
mFreqChars: uInt32; (*The number of characters whose frequency order is less than 512*)
|
||||
mTotalChars: uInt32; (*Total character encounted.*)
|
||||
|
||||
mCharToFreqOrder: pPRInt16; (*Mapping table to get frequency order from char order
|
||||
mCharToFreqOrder: pInt16; (*Mapping table to get frequency order from char order
|
||||
(get from GetOrder())*)
|
||||
|
||||
mTableSize: PRUint32; (*Size of above table*)
|
||||
mTableSize: uInt32; (*Size of above table*)
|
||||
mTypicalDistributionRatio: double;(*This is a constant value varies from language to language,
|
||||
it is used in calculating confidence.
|
||||
See my paper for further detail.*)
|
||||
|
|
@ -48,9 +48,9 @@ type
|
|||
//we do not handle character base on its original encoding string, but
|
||||
//convert this encoding string to a number, here called order.
|
||||
//This allow multiple encoding of a language to share one frequency table
|
||||
function GetOrder(str: PChar): PRInt32; virtual; abstract;
|
||||
function GetOrder(str: pAnsiChar): int32; virtual; abstract;
|
||||
(*feed a block of data and do distribution analysis*)
|
||||
// function HandleData(const aBuf: PChar; aLen: PRUint32): eProbingState; virtual; abstract;
|
||||
// function HandleData(const aBuf: pAnsiChar; aLen: uInt32): eProbingState; virtual; abstract;
|
||||
public
|
||||
destructor Destroy; override;
|
||||
(*This function is for future extension.
|
||||
|
|
@ -66,7 +66,7 @@ type
|
|||
function GotEnoughData: Boolean;
|
||||
|
||||
(*Feed a character with known length*)
|
||||
procedure HandleOneChar(aStr: PChar; aCharLen: PRUint32); virtual;
|
||||
procedure HandleOneChar(aStr: pAnsiChar; aCharLen: uInt32); virtual;
|
||||
|
||||
end;
|
||||
|
||||
|
|
@ -76,7 +76,7 @@ type
|
|||
(* second byte range: 0xa1 -- 0xfe*)
|
||||
(*no validation needed here. State machine has done that*)
|
||||
protected
|
||||
function GetOrder(str: PChar): PRInt32; override;
|
||||
function GetOrder(str: pAnsiChar): int32; override;
|
||||
public
|
||||
constructor Create; reintroduce;
|
||||
end;
|
||||
|
|
@ -87,7 +87,7 @@ type
|
|||
(* second byte range: 0xa1 -- 0xfe*)
|
||||
(*no validation needed here. State machine has done that*)
|
||||
protected
|
||||
function GetOrder(str: PChar): PRInt32; override;
|
||||
function GetOrder(str: pAnsiChar): int32; override;
|
||||
public
|
||||
constructor Create; reintroduce;
|
||||
end;
|
||||
|
|
@ -98,7 +98,7 @@ type
|
|||
(* second byte range: 0xa1 -- 0xfe*)
|
||||
(*no validation needed here. State machine has done that*)
|
||||
protected
|
||||
function GetOrder(str: PChar): PRInt32; override;
|
||||
function GetOrder(str: pAnsiChar): int32; override;
|
||||
public
|
||||
constructor Create; reintroduce;
|
||||
end;
|
||||
|
|
@ -109,7 +109,7 @@ type
|
|||
(* second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe*)
|
||||
(*no validation needed here. State machine has done that*)
|
||||
protected
|
||||
function GetOrder(str: PChar): PRInt32; override;
|
||||
function GetOrder(str: pAnsiChar): int32; override;
|
||||
public
|
||||
constructor Create; reintroduce;
|
||||
end;
|
||||
|
|
@ -120,7 +120,7 @@ type
|
|||
(* second byte range: 0x40 -- 0x7e, 0x81 -- oxfe*)
|
||||
(*no validation needed here. State machine has done that*)
|
||||
protected
|
||||
function GetOrder(str: PChar): PRInt32; override;
|
||||
function GetOrder(str: pAnsiChar): int32; override;
|
||||
public
|
||||
constructor Create; reintroduce;
|
||||
end;
|
||||
|
|
@ -131,7 +131,7 @@ type
|
|||
(* second byte range: 0xa1 -- 0xfe*)
|
||||
(*no validation needed here. State machine has done that*)
|
||||
protected
|
||||
function GetOrder(str: PChar): PRInt32; override;
|
||||
function GetOrder(str: pAnsiChar): int32; override;
|
||||
public
|
||||
constructor Create; reintroduce;
|
||||
end;
|
||||
|
|
@ -151,7 +151,7 @@ begin
|
|||
inherited;
|
||||
end;
|
||||
|
||||
procedure TCharDistributionAnalysis.HandleOneChar(aStr: PChar; aCharLen: PRUint32);
|
||||
procedure TCharDistributionAnalysis.HandleOneChar(aStr: pAnsiChar; aCharLen: uInt32);
|
||||
var
|
||||
order: integer;
|
||||
begin
|
||||
|
|
@ -165,7 +165,7 @@ begin
|
|||
inc(mTotalChars); (*order is valid*)
|
||||
if order < integer(mTableSize) then
|
||||
begin
|
||||
if 512 > aPRint16(mCharToFreqOrder)[order] then
|
||||
if 512 > aInt16(mCharToFreqOrder)[order] then
|
||||
inc(mFreqChars);
|
||||
end;
|
||||
end;
|
||||
|
|
@ -213,7 +213,7 @@ begin
|
|||
mTypicalDistributionRatio := EUCTW_TYPICAL_DISTRIBUTION_RATIO;
|
||||
end;
|
||||
|
||||
function TEUCTWDistributionAnalysis.GetOrder(str: PChar): PRInt32;
|
||||
function TEUCTWDistributionAnalysis.GetOrder(str: pAnsiChar): int32;
|
||||
begin
|
||||
if byte(str^) >= $c4 then
|
||||
Result := 94 * (byte(str[0]) - $c4) + byte(str[1]) - byte($a1)
|
||||
|
|
@ -229,7 +229,7 @@ begin
|
|||
mTypicalDistributionRatio := EUCKR_TYPICAL_DISTRIBUTION_RATIO;
|
||||
end;
|
||||
|
||||
function TEUCKRDistributionAnalysis.GetOrder(str: PChar): PRInt32;
|
||||
function TEUCKRDistributionAnalysis.GetOrder(str: pAnsiChar): int32;
|
||||
begin
|
||||
if byte(str^) >= $b0 then
|
||||
Result := 94 * (byte(str[0]) - $b0) + byte(str[1]) - $a1
|
||||
|
|
@ -245,7 +245,7 @@ begin
|
|||
mTypicalDistributionRatio := GB2312_TYPICAL_DISTRIBUTION_RATIO;
|
||||
end;
|
||||
|
||||
function TGB2312DistributionAnalysis.GetOrder(str: PChar): PRInt32;
|
||||
function TGB2312DistributionAnalysis.GetOrder(str: pAnsiChar): int32;
|
||||
begin
|
||||
if (byte(str[0]) >= $b0) and
|
||||
(byte(str[1]) >= $a1) then
|
||||
|
|
@ -262,7 +262,7 @@ begin
|
|||
mTypicalDistributionRatio := BIG5_TYPICAL_DISTRIBUTION_RATIO;
|
||||
end;
|
||||
|
||||
function TBig5DistributionAnalysis.GetOrder(str: PChar): PRInt32;
|
||||
function TBig5DistributionAnalysis.GetOrder(str: pAnsiChar): int32;
|
||||
begin
|
||||
if byte(str[0]) >= $a4 then
|
||||
begin
|
||||
|
|
@ -283,9 +283,9 @@ begin
|
|||
mTypicalDistributionRatio := JIS_TYPICAL_DISTRIBUTION_RATIO;
|
||||
end;
|
||||
|
||||
function TSJISDistributionAnalysis.GetOrder(str: PChar): PRInt32;
|
||||
function TSJISDistributionAnalysis.GetOrder(str: pAnsiChar): int32;
|
||||
var
|
||||
order: PRInt32;
|
||||
order: int32;
|
||||
begin
|
||||
if (byte(str[0]) >= $81) and
|
||||
(byte(str[0]) <= $9f) then
|
||||
|
|
@ -313,7 +313,7 @@ begin
|
|||
mTypicalDistributionRatio := JIS_TYPICAL_DISTRIBUTION_RATIO;
|
||||
end;
|
||||
|
||||
function TEUCJPDistributionAnalysis.GetOrder(str: PChar): PRInt32;
|
||||
function TEUCJPDistributionAnalysis.GetOrder(str: pAnsiChar): int32;
|
||||
begin
|
||||
if byte(str[0]) >= $a0 then
|
||||
Result := 94 * (byte(str[0]) - $a1) + byte(str[1]) - $a1
|
||||
|
|
@ -321,7 +321,4 @@ begin
|
|||
Result:= -1;
|
||||
end;
|
||||
|
||||
end.
|
||||
|
||||
|
||||
|
||||
end.
|
||||
|
|
@ -1,30 +1,28 @@
|
|||
unit Dump;
|
||||
|
||||
interface
|
||||
const
|
||||
nl = #13#10;
|
||||
|
||||
var
|
||||
DumpStr: string;
|
||||
uses
|
||||
Classes
|
||||
;
|
||||
|
||||
procedure AddDump(Dump: string);
|
||||
procedure ShowDump;
|
||||
procedure SetDumpOutput(DumpOutput: TStrings);
|
||||
|
||||
implementation
|
||||
uses
|
||||
// Windows;
|
||||
UNIT1;
|
||||
|
||||
var
|
||||
_DumpOutput: TStrings = nil;
|
||||
|
||||
procedure SetDumpOutput(DumpOutput: TStrings);
|
||||
begin
|
||||
_DumpOutput := DumpOutput;
|
||||
end;
|
||||
|
||||
procedure AddDump(Dump: string);
|
||||
begin
|
||||
UNIT1.Form1.Memo1.Lines.Add(Dump);
|
||||
// DumpStr := DumpStr + Dump + nl;
|
||||
end;
|
||||
|
||||
procedure ShowDump;
|
||||
begin
|
||||
// OutputDebugString(pChar(DumpStr));
|
||||
// DumpStr := '';
|
||||
if (_DumpOutput <> nil) then
|
||||
_DumpOutput.Add(Dump);
|
||||
end;
|
||||
|
||||
end.
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@
|
|||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: EUCKRFreq.pas,v 1.2 2007/05/20 15:46:03 ya_nick Exp $
|
||||
// $Id: EUCKRFreq.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
|
||||
|
||||
unit EUCKRFreq;
|
||||
interface
|
||||
|
|
@ -45,7 +45,7 @@ const
|
|||
EUCKR_TABLE_SIZE = 2352;
|
||||
|
||||
//Char to FreqOrder table ,
|
||||
EUCKRCharToFreqOrder: array [0..EUCKR_TABLE_SIZE-1] of PRInt16 =
|
||||
EUCKRCharToFreqOrder: array [0..EUCKR_TABLE_SIZE-1] of int16 =
|
||||
(
|
||||
13, 130, 120,1396, 481,1719,1720, 328, 609, 212,1721, 707, 400, 299,1722, 87,
|
||||
1397,1723, 104, 536,1117,1203,1724,1267, 685,1268, 508,1725,1726,1727,1728,1398,
|
||||
|
|
@ -603,4 +603,3 @@ const
|
|||
);
|
||||
implementation
|
||||
end.
|
||||
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ type
|
|||
constructor Create;
|
||||
destructor Destroy; override;
|
||||
|
||||
function Sample(aIn: pChar; aLen: integer): Boolean;
|
||||
function Sample(aIn: pAnsiChar; aLen: integer): Boolean;
|
||||
function GetSomeData: Boolean;
|
||||
function EnoughData: Boolean;
|
||||
procedure CalFreq;
|
||||
|
|
@ -114,12 +114,12 @@ begin
|
|||
end;
|
||||
end;
|
||||
|
||||
function TEUCSampler.Sample(aIn: pChar; aLen: integer): Boolean;
|
||||
function TEUCSampler.Sample(aIn: pAnsiChar; aLen: integer): Boolean;
|
||||
const
|
||||
MAX_LENGTH: integer = MaxInt;// $80000000;
|
||||
var
|
||||
i: integer;
|
||||
p: pChar;
|
||||
p: pAnsiChar;
|
||||
begin
|
||||
if (mState = 1) then
|
||||
begin
|
||||
|
|
@ -175,4 +175,4 @@ begin
|
|||
Result := ( mState <> 1 );
|
||||
end;
|
||||
|
||||
end.
|
||||
end.
|
||||
|
|
@ -16,7 +16,7 @@
|
|||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: EUCTWFreq.pas,v 1.2 2007/05/20 15:46:04 ya_nick Exp $
|
||||
// $Id: EUCTWFreq.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
|
||||
|
||||
unit EUCTWFreq;
|
||||
|
||||
|
|
@ -51,7 +51,7 @@ const
|
|||
//Char to FreqOrder table ,
|
||||
EUCTW_TABLE_SIZE = 8102-2742+16;
|
||||
|
||||
EUCTWCharToFreqOrder: array [0..EUCTW_TABLE_SIZE-1] of PRInt16 =
|
||||
EUCTWCharToFreqOrder: array [0..EUCTW_TABLE_SIZE-1] of int16 =
|
||||
(
|
||||
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, // 2742
|
||||
3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, // 2758
|
||||
|
|
@ -437,4 +437,4 @@ const
|
|||
****************************************************************************************)
|
||||
);
|
||||
implementation
|
||||
end.
|
||||
end.
|
||||
|
|
@ -16,7 +16,7 @@
|
|||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: GB2312Freq.pas,v 1.2 2007/05/20 15:46:04 ya_nick Exp $
|
||||
// $Id: GB2312Freq.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
|
||||
|
||||
unit GB2312Freq;
|
||||
|
||||
|
|
@ -47,7 +47,7 @@ const
|
|||
|
||||
GB2312_TABLE_SIZE = 3760;
|
||||
|
||||
GB2312CharToFreqOrder: array [0..GB2312_TABLE_SIZE-1] of PRInt16 =
|
||||
GB2312CharToFreqOrder: array [0..GB2312_TABLE_SIZE-1] of int16 =
|
||||
(
|
||||
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
|
||||
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
|
||||
|
|
@ -483,4 +483,3 @@ implementation
|
|||
end.
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@
|
|||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: JISFreq.pas,v 1.2 2007/05/20 15:46:04 ya_nick Exp $
|
||||
// $Id: JISFreq.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
|
||||
|
||||
unit JISFreq;
|
||||
|
||||
|
|
@ -49,7 +49,7 @@ const
|
|||
//Char to FreqOrder table ,
|
||||
JIS_TABLE_SIZE = 4368;
|
||||
|
||||
JISCharToFreqOrder: array [0..JIS_TABLE_SIZE-1] of PRInt16 =
|
||||
JISCharToFreqOrder: array [0..JIS_TABLE_SIZE-1] of int16 =
|
||||
(
|
||||
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, // 16
|
||||
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, // 32
|
||||
|
|
@ -577,4 +577,4 @@ const
|
|||
|
||||
);
|
||||
implementation
|
||||
end.
|
||||
end.
|
||||
|
|
@ -16,13 +16,14 @@
|
|||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: JpCntx.pas,v 1.2 2007/05/20 15:46:05 ya_nick Exp $
|
||||
// $Id: JpCntx.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
|
||||
|
||||
unit JpCntx;
|
||||
|
||||
interface
|
||||
uses
|
||||
nsCore;
|
||||
{$HINTS OFF} // Kylix gives an invalid unused hint for TJapaneseContextAnalysis.GetOrder(str: pAnsiChar)
|
||||
|
||||
const
|
||||
NUM_OF_CATEGORY = 6;
|
||||
|
|
@ -31,9 +32,9 @@ type
|
|||
TJapaneseContextAnalysis = class (TObject)
|
||||
private
|
||||
(*category counters, each integer counts the sequences in its category*)
|
||||
mRelSample: array [0..Pred(NUM_OF_CATEGORY)] of PRUint32;
|
||||
mRelSample: array [0..Pred(NUM_OF_CATEGORY)] of uInt32;
|
||||
(*total sequence received*)
|
||||
mTotalRel: PRUint32;
|
||||
mTotalRel: uInt32;
|
||||
(*The order of previous char*)
|
||||
mLastCharOrder: integer;
|
||||
(*if last byte in current buffer is not the last byte of a character, we*)
|
||||
|
|
@ -42,29 +43,29 @@ type
|
|||
(*If this flag is set to PR_TRUE, detection is done and conclusion has been made*)
|
||||
mDone: Boolean;
|
||||
|
||||
function GetOrder(str: PChar; charLen: pPRUint32): PRInt32; overload; virtual; abstract;
|
||||
function GetOrder(str: PChar): PRInt32; overload; virtual; abstract;
|
||||
function GetOrder(str: pAnsiChar; charLen: puInt32): int32; overload; virtual; abstract;
|
||||
function GetOrder(str: pAnsiChar): int32; overload; virtual; abstract;
|
||||
public
|
||||
constructor Create;
|
||||
destructor Destroy; override;
|
||||
procedure Reset;
|
||||
procedure HandleData(const aBuf: PChar; aLen: integer);
|
||||
procedure HandleOneChar(aStr: PChar; aCharLen: integer);
|
||||
procedure HandleData(const aBuf: pAnsiChar; aLen: integer);
|
||||
procedure HandleOneChar(aStr: pAnsiChar; aCharLen: integer);
|
||||
function GotEnoughData: Boolean;
|
||||
function GetConfidence: float;
|
||||
end;
|
||||
|
||||
TSJISContextAnalysis = class (TJapaneseContextAnalysis)
|
||||
public
|
||||
function GetOrder(str: PChar; charLen: pPRUint32): PRInt32; overload; override;
|
||||
function GetOrder(str: PChar): PRInt32; overload; override;
|
||||
function GetOrder(str: pAnsiChar; charLen: puInt32): int32; overload; override;
|
||||
function GetOrder(str: pAnsiChar): int32; overload; override;
|
||||
end;
|
||||
|
||||
TEUCJPContextAnalysis = class (TJapaneseContextAnalysis)
|
||||
public
|
||||
function GetOrder(str: PChar; charLen: pPRUint32): PRInt32; overload; override;
|
||||
function GetOrder(str: pAnsiChar; charLen: puInt32): int32; overload; override;
|
||||
(*We are only interested in Hiragana, so the first byte is '\244'*)
|
||||
function GetOrder(str: PChar): PRInt32; overload; override;
|
||||
function GetOrder(str: pAnsiChar): int32; overload; override;
|
||||
end;
|
||||
|
||||
implementation
|
||||
|
|
@ -174,9 +175,9 @@ begin
|
|||
inherited;
|
||||
end;
|
||||
|
||||
procedure TJapaneseContextAnalysis.HandleOneChar(aStr: PChar; aCharLen: integer);
|
||||
procedure TJapaneseContextAnalysis.HandleOneChar(aStr: pAnsiChar; aCharLen: integer);
|
||||
var
|
||||
order: PRInt32; (*if we received enough data, stop here *)
|
||||
order: int32; (*if we received enough data, stop here *)
|
||||
begin
|
||||
if mTotalRel > MAX_REL_THRESHOLD then
|
||||
mDone:= TRUE;
|
||||
|
|
@ -210,10 +211,10 @@ begin
|
|||
Result := DONT_KNOW;
|
||||
end;
|
||||
|
||||
procedure TJapaneseContextAnalysis.HandleData(const aBuf: PChar; aLen: integer);
|
||||
procedure TJapaneseContextAnalysis.HandleData(const aBuf: pAnsiChar; aLen: integer);
|
||||
var
|
||||
charLen: PRUint32;
|
||||
order: PRInt32;
|
||||
charLen: uInt32;
|
||||
order: int32;
|
||||
i: integer;
|
||||
begin
|
||||
if mDone then
|
||||
|
|
@ -265,7 +266,7 @@ end;
|
|||
|
||||
{ TSJISContextAnalysis }
|
||||
|
||||
function TSJISContextAnalysis.GetOrder(str: PChar; charLen: pPRUint32): PRInt32;
|
||||
function TSJISContextAnalysis.GetOrder(str: pAnsiChar; charLen: puInt32): int32;
|
||||
begin
|
||||
(*find out current char's byte length*)
|
||||
if (byte(str^) >= $81) and
|
||||
|
|
@ -284,7 +285,7 @@ begin
|
|||
Result:= -1;
|
||||
end;
|
||||
|
||||
function TSJISContextAnalysis.GetOrder(str: PChar): PRInt32;
|
||||
function TSJISContextAnalysis.GetOrder(str: pAnsiChar): int32;
|
||||
begin
|
||||
(*We are only interested in Hiragana, so the first byte is '\202'*)
|
||||
if (str[0]=#$82) and
|
||||
|
|
@ -297,7 +298,7 @@ end;
|
|||
|
||||
{ TEUCJPContextAnalysis }
|
||||
|
||||
function TEUCJPContextAnalysis.GetOrder(str: PChar; charLen: pPRUint32): PRInt32;
|
||||
function TEUCJPContextAnalysis.GetOrder(str: pAnsiChar; charLen: puInt32): int32;
|
||||
begin
|
||||
(*find out current char's byte length*)
|
||||
if (byte(str^) = $8e) or
|
||||
|
|
@ -318,7 +319,7 @@ begin
|
|||
Result:= -1;
|
||||
end;
|
||||
|
||||
function TEUCJPContextAnalysis.GetOrder(str: PChar): PRInt32;
|
||||
function TEUCJPContextAnalysis.GetOrder(str: pAnsiChar): int32;
|
||||
begin
|
||||
if (str[0]=#$A4) and
|
||||
(byte(str[1]) >= $a1) and
|
||||
|
|
@ -328,4 +329,4 @@ begin
|
|||
Result := -1;
|
||||
end;
|
||||
|
||||
end.
|
||||
end.
|
||||
|
|
@ -16,7 +16,7 @@
|
|||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: MBUnicodeMultiProber.pas,v 1.2 2007/05/26 13:09:38 ya_nick Exp $
|
||||
// $Id: MBUnicodeMultiProber.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
|
||||
|
||||
unit MBUnicodeMultiProber;
|
||||
|
||||
|
|
@ -32,7 +32,7 @@ type
|
|||
public
|
||||
constructor Create; reintroduce;
|
||||
destructor Destroy; override;
|
||||
function HandleData(aBuf: PChar; aLen: integer): eProbingState; override;
|
||||
function HandleData(aBuf: pAnsiChar; aLen: integer): eProbingState; override;
|
||||
// function GetConfidence: double; override;
|
||||
end;
|
||||
|
||||
|
|
@ -50,6 +50,7 @@ uses
|
|||
{ TMBUnicodeMultiProber }
|
||||
const
|
||||
NUM_OF_PROBERS = 3;
|
||||
{$IFDEF FPC}{$NOTES OFF}{$ENDIF}
|
||||
ONE_CHAR_PROB: float = 0.50;
|
||||
|
||||
{$ifdef DEBUG_chardet}
|
||||
|
|
@ -69,11 +70,11 @@ begin
|
|||
inherited;
|
||||
end;
|
||||
|
||||
function TMBUnicodeMultiProber.HandleData(aBuf: PChar; aLen: integer): eProbingState;
|
||||
function TMBUnicodeMultiProber.HandleData(aBuf: pAnsiChar; aLen: integer): eProbingState;
|
||||
var
|
||||
i: integer; (*do filtering to reduce load to probers*)
|
||||
highbyteBuf: PChar;
|
||||
hptr: PChar;
|
||||
highbyteBuf: pAnsiChar;
|
||||
hptr: pAnsiChar;
|
||||
keepNext: Boolean;
|
||||
begin
|
||||
keepNext := TRUE;
|
||||
|
|
@ -142,4 +143,4 @@ end;
|
|||
// mDetectedCharset := UNKNOWN_CHARSET;
|
||||
//end;
|
||||
|
||||
end.
|
||||
end.
|
||||
|
|
@ -21,7 +21,7 @@ type
|
|||
constructor Create; override;
|
||||
destructor Destroy; override;
|
||||
|
||||
function HandleData(aBuf: PChar; aLen: integer): eProbingState; override;
|
||||
function HandleData(aBuf: pAnsiChar; aLen: integer): eProbingState; override;
|
||||
function GetDetectedCharset: eInternalCharsetID; override;
|
||||
procedure Reset; override;
|
||||
function EnableCharset(Charset: eInternalCharsetID; NewValue: Boolean): Boolean;
|
||||
|
|
@ -76,7 +76,7 @@ begin
|
|||
Result := mDetectedCharset;
|
||||
end;
|
||||
|
||||
function TMultiModelProber.HandleData(aBuf: PChar; aLen: integer): eProbingState;
|
||||
function TMultiModelProber.HandleData(aBuf: pAnsiChar; aLen: integer): eProbingState;
|
||||
var
|
||||
codingState: nsSMState;
|
||||
j: integer;
|
||||
|
|
@ -227,4 +227,3 @@ end;
|
|||
{$endif}
|
||||
|
||||
end.
|
||||
|
||||
|
|
|
|||
|
|
@ -1,100 +0,0 @@
|
|||
// +----------------------------------------------------------------------+
|
||||
// | chsdet - Charset Detector Library |
|
||||
// +----------------------------------------------------------------------+
|
||||
// | Copyright (C) 2006, Nick Yakowlew http://chsdet.sourceforge.net |
|
||||
// +----------------------------------------------------------------------+
|
||||
// | Based on Mozilla sources http://www.mozilla.org/projects/intl/ |
|
||||
// +----------------------------------------------------------------------+
|
||||
// | This library is free software; you can redistribute it and/or modify |
|
||||
// | it under the terms of the GNU General Public License as published by |
|
||||
// | the Free Software Foundation; either version 2 of the License, or |
|
||||
// | (at your option) any later version. |
|
||||
// | This library is distributed in the hope that it will be useful |
|
||||
// | but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
||||
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
|
||||
// | See the GNU Lesser General Public License for more details. |
|
||||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: chsdIntf.pas,v 1.4 2008/06/22 09:04:20 ya_nick Exp $
|
||||
|
||||
unit chsdIntf;
|
||||
|
||||
interface
|
||||
|
||||
uses
|
||||
nsCore;
|
||||
|
||||
procedure csd_Reset; stdcall;
|
||||
function csd_HandleData(aBuf: PChar; aLen: integer): integer; stdcall;
|
||||
function csd_Done: boolean; stdcall;
|
||||
procedure csd_DataEnd; stdcall;
|
||||
function csd_GetDetectedCharset: rCharsetInfo; stdcall;
|
||||
function csd_GetKnownCharsets(var KnownCharsets: pChar): integer; stdcall;
|
||||
procedure csd_GetAbout(var About: rAboutHolder); stdcall;
|
||||
function csd_GetDetectedBOM: eBOMKind; stdcall;
|
||||
procedure csd_DisableCharsetCP(CodePage: integer); stdcall;
|
||||
|
||||
|
||||
implementation
|
||||
uses
|
||||
nsUniversalDetector;
|
||||
|
||||
var
|
||||
Detector: TnsUniversalDetector = nil;
|
||||
|
||||
// Resets the unit-level Detector instance by forwarding to its Reset method.
procedure csd_Reset; stdcall;
|
||||
begin
|
||||
Detector.Reset;
|
||||
end;
|
||||
|
||||
// Forwards the buffer aBuf and its length aLen to Detector.HandleData and
// returns that call's integer result unchanged.
// NOTE(review): the meaning of the result code is defined by
// TnsUniversalDetector, which is not visible here - confirm before relying on it.
function csd_HandleData(aBuf: PChar; aLen: integer): integer; stdcall;
|
||||
begin
|
||||
Result := Detector.HandleData(aBuf, aLen);
|
||||
end;
|
||||
|
||||
// Returns the current value of Detector.Done.
// NOTE(review): presumably TRUE once the detector has reached a conclusion - confirm.
function csd_Done: boolean; stdcall;
|
||||
begin
|
||||
Result := Detector.Done;
|
||||
end;
|
||||
|
||||
// Forwards to Detector.DataEnd.
// NOTE(review): presumably tells the detector that no more input will follow - confirm.
procedure csd_DataEnd; stdcall;
|
||||
begin
|
||||
Detector.DataEnd;
|
||||
end;
|
||||
|
||||
// Returns the rCharsetInfo record produced by Detector.GetDetectedCharsetInfo.
function csd_GetDetectedCharset: rCharsetInfo; stdcall;
|
||||
begin
|
||||
Result := Detector.GetDetectedCharsetInfo;
|
||||
end;
|
||||
|
||||
// Passes the var parameter KnownCharsets to Detector.GetKnownCharset and
// returns that call's integer result unchanged.
// NOTE(review): presumably KnownCharsets receives a list of charset names and
// the result is its length - confirm against TnsUniversalDetector.
function csd_GetKnownCharsets(var KnownCharsets: pChar): integer; stdcall;
|
||||
begin
|
||||
Result := Detector.GetKnownCharset(KnownCharsets);
|
||||
end;
|
||||
|
||||
// Passes the caller's rAboutHolder record (by reference) to Detector.GetAbout.
procedure csd_GetAbout(var About: rAboutHolder); stdcall;
|
||||
begin
|
||||
Detector.GetAbout(About);
|
||||
end;
|
||||
|
||||
// Returns the current value of Detector.BOMDetected as an eBOMKind.
function csd_GetDetectedBOM: eBOMKind; stdcall;
|
||||
begin
|
||||
Result := Detector.BOMDetected;
|
||||
end;
|
||||
|
||||
// Forwards CodePage to Detector.DisableCharset.
// NOTE(review): presumably CodePage is an MS Windows code page id - confirm.
procedure csd_DisableCharsetCP(CodePage: integer); stdcall;
|
||||
begin
|
||||
Detector.DisableCharset(CodePage);
|
||||
end;
|
||||
|
||||
// Unit start-up: create the single Detector instance that every csd_* routine
// in this unit forwards to.
initialization
|
||||
Detector := TnsUniversalDetector.Create;
|
||||
|
||||
// Unit shutdown: release the Detector instance created above.
finalization
|
||||
// NOTE(review): the nil guard looks purely defensive (Free is normally
// nil-safe already) - confirm before simplifying.
if Detector <> nil then
|
||||
Detector.Free;
|
||||
|
||||
end.
|
||||
|
||||
|
||||
|
|
@ -1,161 +0,0 @@
|
|||
[FileVersion]
|
||||
Version=6.0
|
||||
|
||||
[Compiler]
|
||||
A=8
|
||||
B=0
|
||||
C=1
|
||||
D=1
|
||||
E=0
|
||||
F=0
|
||||
G=1
|
||||
H=1
|
||||
I=1
|
||||
J=1
|
||||
K=0
|
||||
L=1
|
||||
M=0
|
||||
N=1
|
||||
O=1
|
||||
P=1
|
||||
Q=0
|
||||
R=0
|
||||
S=0
|
||||
T=0
|
||||
U=1
|
||||
V=1
|
||||
W=0
|
||||
X=1
|
||||
Y=2
|
||||
Z=1
|
||||
ShowHints=1
|
||||
ShowWarnings=1
|
||||
UnitAliases=WinTypes=Windows;WinProcs=Windows;DbiTypes=BDE;DbiProcs=BDE;DbiErrs=BDE;
|
||||
|
||||
[Linker]
|
||||
MapFile=0
|
||||
OutputObjs=0
|
||||
ConsoleApp=1
|
||||
DebugInfo=0
|
||||
RemoteSymbols=0
|
||||
MinStackSize=16384
|
||||
MaxStackSize=1048576
|
||||
ImageBase=4194304
|
||||
ExeDescription=
|
||||
|
||||
[Directories]
|
||||
OutputDir=..\
|
||||
UnitOutputDir=..\dcu
|
||||
PackageDLLOutputDir=
|
||||
PackageDCPOutputDir=
|
||||
SearchPath=.\mbclass;.\sbseq;.\stat
|
||||
Packages=VCL50;VCLX50;VCLSMP50;QRPT50;VCLDB50;VCLIE50;INETDB50;INET50;NMFAST50;dclocx50;dclaxserver50;DJCL50;JVAPPFRMD5R;JVCORED5R;JVBANDSD5R;JVDLGSD5R;JVCMPD5R;JVCRYPTD5R;JVCTRLSD5R;JVCUSTOMD5R;JVDOCKINGD5R;JVDOTNETCTRLSD5R;JVEDID5R;JVGLOBUSD5R;JVHMID5R;JVINSPECTORD5R;JVINTERPRETERD5R;JVJANSD5R;JVMANAGEDTHREADSD5R;JVMMD5R;JVNETD5R;JVSTDCTRLSD5R;JVPAGECOMPSD5R;JVPLUGIND5R;JVPRINTPREVIEWD5R;JVSYSTEMD5R;JVTIMEFRAMEWORKD5R;JVUIBD5R;JVVALIDATORSD5R;JVWIZARDD5R;JVXPCTRLSD5R;vcl
|
||||
Conditionals=
|
||||
DebugSourceDirs=
|
||||
UsePackages=0
|
||||
|
||||
[Parameters]
|
||||
RunParams=
|
||||
HostApplication=
|
||||
Launcher=
|
||||
UseLauncher=0
|
||||
DebugCWD=
|
||||
|
||||
[Language]
|
||||
ActiveLang=
|
||||
ProjectLang=$00000407
|
||||
|
||||
[Version Info]
|
||||
IncludeVerInfo=1
|
||||
AutoIncBuild=0
|
||||
MajorVer=0
|
||||
MinorVer=2
|
||||
Release=6
|
||||
Build=2
|
||||
Debug=0
|
||||
PreRelease=0
|
||||
Special=0
|
||||
Private=0
|
||||
DLL=1
|
||||
Locale=2057
|
||||
CodePage=1252
|
||||
|
||||
[Version Info Keys]
|
||||
CompanyName=
|
||||
FileDescription=Charset detector
|
||||
FileVersion=0.2.6.2
|
||||
InternalName=
|
||||
LegalCopyright=Nick Yakowlew, ya_nick@users.sourceforge.net
|
||||
LegalTrademarks=
|
||||
OriginalFilename=chsdet.dll
|
||||
ProductName=Charset detector
|
||||
ProductVersion=0.2
|
||||
Comments=LGPL Licence
|
||||
|
||||
[Excluded Packages]
|
||||
E:\Data\Yan\Delphi\log4delphi\bin\log4delphi_D6.bpl=Log4Delphi 0.5
|
||||
c:\program files\borland\delphi6\Bin\DCLNMF60.bpl=NetMasters Fastnet Tools
|
||||
C:\PROGRAM FILES\BORLAND\DELPHI6\PROJECTS\BPL\TBX_D6.BPL=Toolbar2000 -- TBX Extensions (Alex Denisov)
|
||||
C:\PROGRAM FILES\BORLAND\DELPHI6\PROJECTS\BPL\TB2K_D6.BPL=Toolbar2000 Components (Jordan Russell)
|
||||
C:\PROGRAM FILES\BORLAND\DELPHI6\PROJECTS\BPL\TBXDSGN_D6.BPL=Toolbar2000 -- TBX Extensions Design Package (Alex Denisov)
|
||||
C:\PROGRAM FILES\BORLAND\DELPHI6\PROJECTS\BPL\TB2KDSGN_D6.BPL=Toolbar2000 Design Package (Jordan Russell)
|
||||
c:\program files\borland\delphi6\Projects\Bpl\IEDcomp.bpl=Internet EDiting components
|
||||
C:\PROGRAM FILES\BORLAND\DELPHI6\PROJECTS\BPL\TNTUNICODEVCL_D60.BPL=Tnt Unicode Controls
|
||||
c:\program files\borland\delphi6\Projects\Bpl\SmpltCP.bpl=(untitled)
|
||||
c:\program files\borland\delphi6\Projects\Bpl\devFileMonitorPkg.bpl=(untitled)
|
||||
c:\program files\borland\delphi6\Bin\dclsoap60.bpl=Borland SOAP Components
|
||||
c:\program files\borland\delphi6\Projects\Bpl\SpTBXLibDsgn_d6.bpl=Toolbar2000 -- SpTBXLib Design Package
|
||||
c:\program files\borland\delphi6\Projects\Bpl\LSFindReplaceDialogW_6.bpl=LS Find/Replace Dialog for Wide Strings
|
||||
c:\program files\borland\delphi6\Projects\Bpl\Unicode6.bpl=Unicode components
|
||||
c:\program files\borland\delphi6\Projects\Bpl\credit.bpl=(untitled)
|
||||
c:\program files\borland\delphi6\Projects\Bpl\pActivePorts.bpl=LGM ActivePorts Component
|
||||
c:\program files\borland\delphi6\Projects\Bpl\USE.bpl=Unicode Syntax Edit control
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvAppFrmD6D.bpl=JVCL Application and Form Components
|
||||
C:\PROGRAM FILES\BORLAND\DELPHI6\PROJECTS\BPL\JVCORED6D.BPL=JVCL Core Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvCmpD6D.bpl=JVCL Non-Visual Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvCryptD6D.bpl=JVCL Encryption and Compression Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvCtrlsD6D.bpl=JVCL Visual Controls
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvCustomD6D.bpl=JVCL Custom Controls
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvDlgsD6D.bpl=JVCL Dialog Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvDockingD6D.bpl=JVCL Docking Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvGlobusD6D.bpl=JVCL Globus Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvHMID6D.bpl=JVCL HMI Controls design time unit
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvJansD6D.bpl=JVCL Jans Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvManagedThreadsD6D.bpl=JVCL Managed Threads
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvMMD6D.bpl=JVCL Multimedia and Image Components
|
||||
C:\PROGRAM FILES\BORLAND\DELPHI6\PROJECTS\BPL\JVSTDCTRLSD6D.BPL=JVCL Standard Controls
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvPageCompsD6D.bpl=JVCL Page Style Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvPluginD6D.bpl=JVCL Plugin Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvSystemD6D.bpl=JVCL System Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvTimeFrameworkD6D.bpl=JVCL Time Framework
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvValidatorsD6D.bpl=JVCL Validators and Error Provider Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvXPCtrlsD6D.bpl=JVCL XP Controls
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvBandsD6D.bpl=JVCL Band Objects
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvBDED6D.bpl=JVCL BDE Components
|
||||
C:\PROGRAM FILES\BORLAND\DELPHI6\PROJECTS\BPL\JVDBD6D.BPL=JVCL Database Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvDotNetCtrlsD6D.bpl=JVCL DotNet Controls
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvEDID6D.bpl=JVCL EDI Components Designtime Package
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvInspectorD6D.bpl=JVCL Inspector Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvInterpreterD6D.bpl=JVCL Interpreter Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvNetD6D.bpl=JVCL Network Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvPrintPreviewD6D.bpl=JVCL Print Preview Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvUIBD6D.bpl=JVCL Unified Interbase Components
|
||||
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvWizardD6D.bpl=JVCL Wizard Design Time Package
|
||||
c:\program files\borland\delphi6\Projects\Bpl\components.bpl=Components for tsWebEditor
|
||||
c:\program files\borland\delphi6\Projects\Bpl\CoolTrayIcon_D6plus.bpl=CoolTrayIcon and Friends
|
||||
C:\PROGRAM FILES\BORLAND\DELPHI6\BIN\DCLBDE60.BPL=Borland BDE DB Components
|
||||
C:\PROGRAM FILES\BORLAND\DELPHI6\BIN\DBX60.BPL=Borland SQL Explorer UI Package
|
||||
c:\program files\borland\delphi6\Projects\Bpl\ClassBrowsing.bpl=ClassBrowsing components
|
||||
c:\program files\borland\delphi6\Bin\dclqrt60.bpl=QuickReport Components
|
||||
c:\program files\borland\delphi6\Bin\dclcds60.bpl=Borland Base Cached ClientDataset Component
|
||||
C:\PROGRAM FILES\BORLAND\DELPHI6\BIN\DCLMID60.BPL=Borland MyBase DataAccess Components
|
||||
c:\program files\borland\delphi6\Bin\dclbdecds60.bpl=Borland Local BDE ClientDataset Components
|
||||
c:\program files\borland\delphi6\Bin\dcltee60.bpl=TeeChart Components
|
||||
c:\program files\borland\delphi6\Bin\dcltqr60.bpl=TeeChart for QuickReport Components
|
||||
c:\program files\borland\delphi6\Bin\dclib60.bpl=InterBase Data Access Components
|
||||
c:\program files\borland\delphi6\Bin\dcldbxcds60.bpl=Borland Local DBX ClientDataset Components
|
||||
c:\program files\borland\delphi6\Bin\DBWEBXPRT.BPL=Borland Web Wizard Package
|
||||
c:\program files\borland\delphi6\Projects\Bpl\prgInternet6.bpl=Progsan Internet Components
|
||||
c:\program files\borland\delphi6\Projects\Bpl\Comps_D6.bpl=(untitled)
|
||||
c:\program files\borland\delphi6\Projects\Bpl\SynEdit_D6.bpl=SynEdit component suite
|
||||
c:\program files\borland\delphi6\Projects\Bpl\DevCpp.bpl=Dev-c++ components
|
||||
|
|
@ -1,38 +0,0 @@
|
|||
// +----------------------------------------------------------------------+
|
||||
// | chsdet - Charset Detector Library |
|
||||
// +----------------------------------------------------------------------+
|
||||
// | Copyright (C) 2006, Nick Yakowlew http://chsdet.sourceforge.net |
|
||||
// +----------------------------------------------------------------------+
|
||||
// | Based on Mozilla sources http://www.mozilla.org/projects/intl/ |
|
||||
// +----------------------------------------------------------------------+
|
||||
// | This library is free software; you can redistribute it and/or modify |
|
||||
// | it under the terms of the GNU General Public License as published by |
|
||||
// | the Free Software Foundation; either version 2 of the License, or |
|
||||
// | (at your option) any later version. |
|
||||
// | This library is distributed in the hope that it will be useful |
|
||||
// | but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
||||
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
|
||||
// | See the GNU Lesser General Public License for more details. |
|
||||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: chsdet.dpr,v 1.3 2007/05/26 13:07:21 ya_nick Exp $
|
||||
|
||||
// Dynamic library project: exposes the csd_* stdcall routines implemented in chsdIntf.
library chsdet;
|
||||
|
||||
uses
|
||||
chsdIntf in 'chsdIntf.pas';
|
||||
|
||||
// Every routine declared in the chsdIntf interface section is exported,
// including csd_GetDetectedBOM and csd_DisableCharsetCP, which were
// previously declared there but missing from this list.
exports
|
||||
csd_Reset,
|
||||
csd_HandleData,
|
||||
csd_Done,
|
||||
csd_DataEnd,
|
||||
csd_GetDetectedCharset,
|
||||
csd_GetKnownCharsets,
|
||||
csd_GetAbout,
|
||||
csd_GetDetectedBOM,
|
||||
csd_DisableCharsetCP;
|
||||
{$R *.res}
|
||||
|
||||
begin
|
||||
end.
|
||||
|
||||
|
|
@ -1,172 +0,0 @@
|
|||
<?xml version="1.0"?>
|
||||
<CONFIG>
|
||||
<ProjectOptions>
|
||||
<PathDelim Value="/"/>
|
||||
<Version Value="6"/>
|
||||
<General>
|
||||
<Flags>
|
||||
<MainUnitHasUsesSectionForAllUnits Value="False"/>
|
||||
<MainUnitHasCreateFormStatements Value="False"/>
|
||||
<MainUnitHasTitleStatement Value="False"/>
|
||||
<Runnable Value="False"/>
|
||||
</Flags>
|
||||
<MainUnit Value="0"/>
|
||||
<IconPath Value="./"/>
|
||||
<TargetFileExt Value=""/>
|
||||
<UseAppBundle Value="False"/>
|
||||
<ActiveEditorIndexAtStart Value="0"/>
|
||||
</General>
|
||||
<VersionInfo>
|
||||
<UseVersionInfo Value="True"/>
|
||||
<ProjectVersion Value=""/>
|
||||
</VersionInfo>
|
||||
<PublishOptions>
|
||||
<Version Value="2"/>
|
||||
<IgnoreBinaries Value="False"/>
|
||||
<IncludeFileFilter Value="*.(pas|pp|inc|lfm|lpr|lrs|lpi|lpk|sh|xml)"/>
|
||||
<ExcludeFileFilter Value="*.(bak|ppu|ppw|o|so);*~;backup"/>
|
||||
</PublishOptions>
|
||||
<RunParams>
|
||||
<local>
|
||||
<FormatVersion Value="1"/>
|
||||
<LaunchingApplication PathPlusParams="/usr/X11R6/bin/xterm -T 'Lazarus Run Output' -e $(LazarusDir)/tools/runwait.sh $(TargetCmdLine)"/>
|
||||
</local>
|
||||
</RunParams>
|
||||
<RequiredPackages Count="1">
|
||||
<Item1>
|
||||
<PackageName Value="LCL"/>
|
||||
</Item1>
|
||||
</RequiredPackages>
|
||||
<Units Count="10">
|
||||
<Unit0>
|
||||
<Filename Value="chsdet.lpr"/>
|
||||
<IsPartOfProject Value="True"/>
|
||||
<UnitName Value="chsdet"/>
|
||||
<CursorPos X="9" Y="38"/>
|
||||
<TopLine Value="1"/>
|
||||
<EditorIndex Value="0"/>
|
||||
<UsageCount Value="20"/>
|
||||
<Loaded Value="True"/>
|
||||
</Unit0>
|
||||
<Unit1>
|
||||
<Filename Value=""/>
|
||||
<UsageCount Value="10"/>
|
||||
</Unit1>
|
||||
<Unit2>
|
||||
<Filename Value="nsSBCharSetProber.pas"/>
|
||||
<UnitName Value="nsSBCharSetProber"/>
|
||||
<CursorPos X="1" Y="24"/>
|
||||
<TopLine Value="12"/>
|
||||
<EditorIndex Value="2"/>
|
||||
<UsageCount Value="10"/>
|
||||
<Loaded Value="True"/>
|
||||
</Unit2>
|
||||
<Unit3>
|
||||
<Filename Value="sbseq/LangHebrewModel.pas"/>
|
||||
<IsPartOfProject Value="True"/>
|
||||
<UsageCount Value="20"/>
|
||||
<SyntaxHighlighter Value="Text"/>
|
||||
</Unit3>
|
||||
<Unit4>
|
||||
<Filename Value="sbseq/LangBulgarianModel.pas"/>
|
||||
<IsPartOfProject Value="True"/>
|
||||
<UsageCount Value="20"/>
|
||||
<SyntaxHighlighter Value="Text"/>
|
||||
</Unit4>
|
||||
<Unit5>
|
||||
<Filename Value="sbseq/LangCyrillicModel.pas"/>
|
||||
<IsPartOfProject Value="True"/>
|
||||
<UsageCount Value="20"/>
|
||||
<SyntaxHighlighter Value="Text"/>
|
||||
</Unit5>
|
||||
<Unit6>
|
||||
<Filename Value="sbseq/LangGreekModel.pas"/>
|
||||
<IsPartOfProject Value="True"/>
|
||||
<UsageCount Value="20"/>
|
||||
<SyntaxHighlighter Value="Text"/>
|
||||
</Unit6>
|
||||
<Unit7>
|
||||
<Filename Value="chsdIntf.pas"/>
|
||||
<UnitName Value="chsdIntf"/>
|
||||
<CursorPos X="13" Y="31"/>
|
||||
<TopLine Value="9"/>
|
||||
<EditorIndex Value="1"/>
|
||||
<UsageCount Value="10"/>
|
||||
<Loaded Value="True"/>
|
||||
</Unit7>
|
||||
<Unit8>
|
||||
<Filename Value="nsCore.pas"/>
|
||||
<UnitName Value="nsCore"/>
|
||||
<CursorPos X="1" Y="1"/>
|
||||
<TopLine Value="17"/>
|
||||
<UsageCount Value="10"/>
|
||||
</Unit8>
|
||||
<Unit9>
|
||||
<Filename Value="dbg.inc"/>
|
||||
<CursorPos X="1" Y="1"/>
|
||||
<TopLine Value="1"/>
|
||||
<UsageCount Value="10"/>
|
||||
</Unit9>
|
||||
</Units>
|
||||
<JumpHistory Count="6" HistoryIndex="5">
|
||||
<Position1>
|
||||
<Filename Value="nsSBCharSetProber.pas"/>
|
||||
<Caret Line="46" Column="19" TopLine="24"/>
|
||||
</Position1>
|
||||
<Position2>
|
||||
<Filename Value="nsSBCharSetProber.pas"/>
|
||||
<Caret Line="28" Column="10" TopLine="12"/>
|
||||
</Position2>
|
||||
<Position3>
|
||||
<Filename Value="nsSBCharSetProber.pas"/>
|
||||
<Caret Line="24" Column="1" TopLine="24"/>
|
||||
</Position3>
|
||||
<Position4>
|
||||
<Filename Value="nsSBCharSetProber.pas"/>
|
||||
<Caret Line="46" Column="19" TopLine="24"/>
|
||||
</Position4>
|
||||
<Position5>
|
||||
<Filename Value="chsdet.lpr"/>
|
||||
<Caret Line="35" Column="17" TopLine="1"/>
|
||||
</Position5>
|
||||
<Position6>
|
||||
<Filename Value="chsdet.lpr"/>
|
||||
<Caret Line="32" Column="12" TopLine="1"/>
|
||||
</Position6>
|
||||
</JumpHistory>
|
||||
</ProjectOptions>
|
||||
<CompilerOptions>
|
||||
<Version Value="5"/>
|
||||
<SearchPaths>
|
||||
<UnitOutputDirectory Value="/home/yan/chsdet/dcu"/>
|
||||
</SearchPaths>
|
||||
<Parsing>
|
||||
<SyntaxOptions>
|
||||
<DelphiCompat Value="True"/>
|
||||
</SyntaxOptions>
|
||||
</Parsing>
|
||||
<CodeGeneration>
|
||||
<SmartLinkUnit Value="True"/>
|
||||
<Generate Value="Faster"/>
|
||||
<TargetProcessor Value="3"/>
|
||||
<TargetCPU Value="i386"/>
|
||||
<TargetOS Value="Linux"/>
|
||||
</CodeGeneration>
|
||||
<Linking>
|
||||
<LinkSmart Value="True"/>
|
||||
</Linking>
|
||||
<Other>
|
||||
<CompilerPath Value="$(CompPath)"/>
|
||||
</Other>
|
||||
</CompilerOptions>
|
||||
<Debugging>
|
||||
<Exceptions Count="2">
|
||||
<Item1>
|
||||
<Name Value="ECodetoolError"/>
|
||||
</Item1>
|
||||
<Item2>
|
||||
<Name Value="EFOpenError"/>
|
||||
</Item2>
|
||||
</Exceptions>
|
||||
</Debugging>
|
||||
</CONFIG>
|
||||
|
|
@ -1,44 +0,0 @@
|
|||
// +----------------------------------------------------------------------+
|
||||
// | chsdet - Charset Detector Library |
|
||||
// +----------------------------------------------------------------------+
|
||||
// | Copyright (C) 2006, Nick Yakowlew http://chsdet.sourceforge.net |
|
||||
// +----------------------------------------------------------------------+
|
||||
// | Based on Mozilla sources http://www.mozilla.org/projects/intl/ |
|
||||
// +----------------------------------------------------------------------+
|
||||
// | This library is free software; you can redistribute it and/or modify |
|
||||
// | it under the terms of the GNU General Public License as published by |
|
||||
// | the Free Software Foundation; either version 2 of the License, or |
|
||||
// | (at your option) any later version. |
|
||||
// | This library is distributed in the hope that it will be useful |
|
||||
// | but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
||||
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
|
||||
// | See the GNU Lesser General Public License for more details. |
|
||||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: chsdet.lpr,v 1.1 2009/03/20 17:40:22 ya_nick Exp $
|
||||
|
||||
library chsdet;
|
||||
|
||||
uses
|
||||
chsdIntf in 'chsdIntf.pas',
|
||||
LangBulgarianModel in 'sbseq/LangBulgarianModel.pas',
|
||||
LangCyrillicModel in 'sbseq/LangCyrillicModel.pas',
|
||||
LangGreekModel in 'sbseq/LangGreekModel.pas',
|
||||
LangHebrewModel in 'sbseq/LangHebrewModel.pas' ;
|
||||
|
||||
exports
|
||||
csd_Reset,
|
||||
csd_HandleData,
|
||||
csd_Done,
|
||||
csd_DataEnd,
|
||||
csd_GetDetectedCharset,
|
||||
csd_GetKnownCharsets,
|
||||
csd_GetAbout;
|
||||
{.chsdet$R *.res}
|
||||
|
||||
{.chsdet$R chsdet.res}
|
||||
|
||||
begin
|
||||
end.
|
||||
|
||||
|
|
@ -16,7 +16,7 @@
|
|||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: nsCodingStateMachine.pas,v 1.3 2007/05/26 13:09:38 ya_nick Exp $
|
||||
// $Id: nsCodingStateMachine.pas,v 1.4 2013/04/23 19:47:10 ya_nick Exp $
|
||||
|
||||
unit nsCodingStateMachine;
|
||||
|
||||
|
|
@ -38,9 +38,9 @@ type
|
|||
type
|
||||
SMModel = record
|
||||
classTable: Pointer; //nsPkgInt;
|
||||
classFactor: PRUint32;
|
||||
classFactor: uInt32;
|
||||
stateTable: Pointer; //nsPkgInt;
|
||||
charLenTable: Pointer; // pByteArray; // array of byte; // pPRUint32;
|
||||
charLenTable: Pointer; // aByteArray; // array of byte; // puInt32;
|
||||
CharsetID: eInternalCharsetID;
|
||||
end;
|
||||
pSMModel = ^SMModel;
|
||||
|
|
@ -49,16 +49,16 @@ type
|
|||
TnsCodingStateMachine = class (TObject)
|
||||
protected
|
||||
mCurrentState: nsSMState;
|
||||
mCurrentCharLen: PRUint32;
|
||||
mCurrentBytePos: PRUint32;
|
||||
mCurrentCharLen: uInt32;
|
||||
mCurrentBytePos: uInt32;
|
||||
mModel: SMModel;
|
||||
|
||||
public
|
||||
Enabled: Boolean;
|
||||
constructor Create(sm: SMModel);
|
||||
destructor Destroy; override;
|
||||
function NextState(c: char): nsSMState;
|
||||
function GetCurrentCharLen: PRUint32;
|
||||
function NextState(c: AnsiChar): nsSMState;
|
||||
function GetCurrentCharLen: uInt32;
|
||||
procedure Reset;
|
||||
function GetCharsetID: eInternalCharsetID;
|
||||
|
||||
|
|
@ -85,9 +85,9 @@ begin
|
|||
inherited;
|
||||
end;
|
||||
|
||||
function TnsCodingStateMachine.NextState(c: char): nsSMState;
|
||||
function TnsCodingStateMachine.NextState(c: AnsiChar): nsSMState;
|
||||
var
|
||||
byteCls: PRUint32;
|
||||
byteCls: uInt32;
|
||||
begin
|
||||
if not Enabled then
|
||||
begin
|
||||
|
|
@ -95,14 +95,14 @@ begin
|
|||
exit;
|
||||
end;
|
||||
(*for each byte we get its class; if it is the first byte, we also get the byte length*)
|
||||
byteCls := pByteArray(mModel.classTable)[integer(c)];
|
||||
byteCls := aByteArray(mModel.classTable)[integer(c)];
|
||||
if mCurrentState = eStart then
|
||||
begin
|
||||
mCurrentBytePos := 0;
|
||||
mCurrentCharLen := pByteArray(mModel.charLenTable)[byteCls];
|
||||
mCurrentCharLen := aByteArray(mModel.charLenTable)[byteCls];
|
||||
end;
|
||||
(*from byte's class and stateTable, we get its next state*)
|
||||
mCurrentState := nsSMState(pByteArray(mModel.stateTable)[cardinal(mCurrentState) * mModel.classFactor + byteCls]);
|
||||
mCurrentState := nsSMState(aByteArray(mModel.stateTable)[cardinal(mCurrentState) * mModel.classFactor + byteCls]);
|
||||
inc(mCurrentBytePos);
|
||||
|
||||
//if mCurrentBytePos > mCurrentCharLen then
|
||||
|
|
@ -111,7 +111,7 @@ begin
|
|||
Result:= mCurrentState;
|
||||
end;
|
||||
|
||||
function TnsCodingStateMachine.GetCurrentCharLen: PRUint32;
|
||||
function TnsCodingStateMachine.GetCurrentCharLen: uInt32;
|
||||
begin
|
||||
Result:= mCurrentCharLen;
|
||||
end;
|
||||
|
|
@ -126,4 +126,4 @@ begin
|
|||
Result:= mModel.CharsetID;
|
||||
end;
|
||||
|
||||
end.
|
||||
end.
|
||||
|
|
@ -16,24 +16,23 @@
|
|||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: nsCore.pas,v 1.4 2008/06/22 09:04:20 ya_nick Exp $
|
||||
// $Id: nsCore.pas,v 1.5 2013/04/23 19:47:10 ya_nick Exp $
|
||||
|
||||
unit nsCore;
|
||||
|
||||
interface
|
||||
|
||||
type
|
||||
PRInt16 = smallint;
|
||||
PRUint16 = word;
|
||||
PRInt32 = integer;
|
||||
PRUint32 = cardinal;
|
||||
int16 = smallint;
|
||||
int32 = integer;
|
||||
uInt32 = cardinal;
|
||||
|
||||
pByteArray = array of Byte;
|
||||
pPRUint32 = ^PRUint32;
|
||||
aPRUint32 = array of PRUint32;
|
||||
aByteArray = array of Byte;
|
||||
puInt32 = ^uInt32;
|
||||
auInt32 = array of uInt32;
|
||||
|
||||
pPRint16 = ^PRint16;
|
||||
aPRint16 = array of PRint16;
|
||||
pInt16 = ^int16;
|
||||
aInt16 = array of int16;
|
||||
|
||||
const
|
||||
SURE_YES: double = 0.99;
|
||||
|
|
@ -50,7 +49,7 @@ type
|
|||
);
|
||||
|
||||
type
|
||||
nsResult = PRUint32;
|
||||
nsResult = uInt32;
|
||||
const
|
||||
NS_OK = 0;
|
||||
NS_ERROR_OUT_OF_MEMORY = $8007000e;
|
||||
|
|
@ -76,25 +75,28 @@ type
|
|||
BOM_UTF8 // EF BB BF UTF-8
|
||||
);
|
||||
|
||||
rBOMDef = record
|
||||
Length: integer;
|
||||
BOM: array [0..3] of AnsiChar;
|
||||
end;
|
||||
const
|
||||
KnownBOM: array [eBOMKind] of array [0..4] of Char = (
|
||||
// first element = byte count
|
||||
(#$00, #$00, #$00, #$00, #$00),
|
||||
(#$04, #$00, #$00, #$FE, #$FF),
|
||||
(#$04, #$FF, #$FE, #$00, #$00),
|
||||
(#$04, #$00, #$00, #$FF, #$FE),
|
||||
(#$04, #$FE, #$FF, #$00, #$00),
|
||||
(#$02, #$FE, #$FF, #$00, #$00),
|
||||
(#$02, #$FF, #$FE, #$00, #$00),
|
||||
(#$03, #$EF, #$BB, #$BF, #$00)
|
||||
KNOWN_BOM: array [eBOMKind] of rBOMDef = (
|
||||
(Length: 00; BOM: (#$00, #$00, #$00, #$00)),
|
||||
(Length: 04; BOM: (#$00, #$00, #$FE, #$FF)),
|
||||
(Length: 04; BOM: (#$FF, #$FE, #$00, #$00)),
|
||||
(Length: 04; BOM: (#$00, #$00, #$FF, #$FE)),
|
||||
(Length: 04; BOM: (#$FE, #$FF, #$00, #$00)),
|
||||
(Length: 02; BOM: (#$FE, #$FF, #$00, #$00)),
|
||||
(Length: 02; BOM: (#$FF, #$FE, #$00, #$00)),
|
||||
(Length: 03; BOM: (#$EF, #$BB, #$BF, #$00))
|
||||
);
|
||||
|
||||
// "extended" charset info
|
||||
type
|
||||
rCharsetInfo = record
|
||||
Name: pChar;
|
||||
CodePage: integer;
|
||||
Language: pChar;
|
||||
rCharsetInfo = record
|
||||
Name: PAnsiChar;
|
||||
CodePage: Integer;
|
||||
Language: PAnsiChar;
|
||||
end;
|
||||
|
||||
eInternalCharsetID = (
|
||||
|
|
@ -333,16 +335,16 @@ const
|
|||
(* both functions Allocate a new buffer for newBuf. This buffer should be *)
|
||||
(* freed by the caller using PR_FREEIF.*)
|
||||
(* Both functions return PR_FALSE in case of memory allocation failure.*)
|
||||
function FilterWithoutEnglishLetters(aBuf: PChar; aLen: integer; var newBuf: PChar; var newLen: integer): Boolean;
|
||||
function FilterWithEnglishLetters(aBuf: PChar; aLen: integer; var newBuf: PChar; var newLen: integer): Boolean;
|
||||
function FilterWithoutEnglishLetters(aBuf: pAnsiChar; aLen: integer; var newBuf: pAnsiChar; var newLen: integer): Boolean;
|
||||
function FilterWithEnglishLetters(aBuf: pAnsiChar; aLen: integer; var newBuf: pAnsiChar; var newLen: integer): Boolean;
|
||||
implementation
|
||||
|
||||
function FilterWithEnglishLetters(aBuf: PChar;
|
||||
aLen: integer; var newBuf: PChar; var newLen: integer): Boolean;
|
||||
function FilterWithEnglishLetters(aBuf: pAnsiChar;
|
||||
aLen: integer; var newBuf: pAnsiChar; var newLen: integer): Boolean;
|
||||
var
|
||||
newptr: pChar;
|
||||
prevPtr: pChar;
|
||||
curPtr: pChar;
|
||||
newptr: pAnsiChar;
|
||||
prevPtr: pAnsiChar;
|
||||
curPtr: pAnsiChar;
|
||||
isInTag: Boolean;
|
||||
begin
|
||||
//do filtering to reduce load to probers
|
||||
|
|
@ -403,12 +405,12 @@ begin
|
|||
Result := TRUE;
|
||||
end;
|
||||
|
||||
function FilterWithoutEnglishLetters(aBuf: PChar;
|
||||
aLen: integer; var newBuf: PChar; var newLen: integer): Boolean;
|
||||
function FilterWithoutEnglishLetters(aBuf: pAnsiChar;
|
||||
aLen: integer; var newBuf: pAnsiChar; var newLen: integer): Boolean;
|
||||
var
|
||||
newPtr: pChar;
|
||||
prevPtr: pChar;
|
||||
curPtr: pChar;
|
||||
newPtr: pAnsiChar;
|
||||
prevPtr: pAnsiChar;
|
||||
curPtr: pAnsiChar;
|
||||
meetMSB: Boolean;
|
||||
begin
|
||||
(*This filter applies to all scripts which do not use English characters*)
|
||||
|
|
@ -464,8 +466,3 @@ begin
|
|||
end;
|
||||
|
||||
end.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@
|
|||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: nsEscCharsetProber.pas,v 1.3 2007/05/26 13:09:38 ya_nick Exp $
|
||||
// $Id: nsEscCharsetProber.pas,v 1.4 2013/04/23 19:47:10 ya_nick Exp $
|
||||
|
||||
unit nsEscCharsetProber;
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@
|
|||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: nsMBCSMultiProber.pas,v 1.2 2007/05/26 13:09:38 ya_nick Exp $
|
||||
// $Id: nsMBCSMultiProber.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
|
||||
|
||||
unit nsMBCSMultiProber;
|
||||
|
||||
|
|
@ -36,12 +36,12 @@ type
|
|||
mContextAnalysis: array of TJapaneseContextAnalysis;
|
||||
mBestGuess: integer;
|
||||
|
||||
function RunStatAnalyse(aBuf: PChar; aLen: integer): eProbingState;
|
||||
function RunStatAnalyse(aBuf: pAnsiChar; aLen: integer): eProbingState;
|
||||
function GetConfidenceFor(index: integer): double; reintroduce;
|
||||
public
|
||||
constructor Create; reintroduce;
|
||||
destructor Destroy; override;
|
||||
function HandleData(aBuf: PChar; aLen: integer): eProbingState; override;
|
||||
function HandleData(aBuf: pAnsiChar; aLen: integer): eProbingState; override;
|
||||
function GetConfidence: double; override;
|
||||
procedure Reset; override;
|
||||
{$ifdef DEBUG_chardet}
|
||||
|
|
@ -141,11 +141,11 @@ begin
|
|||
end;
|
||||
{$endif}
|
||||
|
||||
function TnsMBCSMultiProber.HandleData(aBuf: PChar; aLen: integer): eProbingState;
|
||||
function TnsMBCSMultiProber.HandleData(aBuf: pAnsiChar; aLen: integer): eProbingState;
|
||||
var
|
||||
i: integer; (*do filtering to reduce load to probers*)
|
||||
highbyteBuf: PChar;
|
||||
hptr: PChar;
|
||||
highbyteBuf: pAnsiChar;
|
||||
hptr: pAnsiChar;
|
||||
keepNext: Boolean;
|
||||
begin
|
||||
keepNext := TRUE;
|
||||
|
|
@ -197,12 +197,12 @@ begin
|
|||
Result := mState;
|
||||
end;
|
||||
|
||||
function TnsMBCSMultiProber.RunStatAnalyse(aBuf: PChar; aLen: integer): eProbingState;
|
||||
function TnsMBCSMultiProber.RunStatAnalyse(aBuf: pAnsiChar; aLen: integer): eProbingState;
|
||||
var
|
||||
i, c: integer;
|
||||
codingState: nsSMState;
|
||||
charLen: byte;
|
||||
mLastChar: array [0..1] of Char;
|
||||
mLastChar: array [0..1] of AnsiChar;
|
||||
begin
|
||||
{$IFDEF DEBUG_chardet}
|
||||
AddDump('MultiByte - Stat Analyse - start');
|
||||
|
|
@ -313,4 +313,4 @@ begin
|
|||
end;
|
||||
end;
|
||||
|
||||
end.
|
||||
end.
|
||||
|
|
@ -16,7 +16,7 @@
|
|||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: nsPkg.pas,v 1.2 2007/05/20 15:46:11 ya_nick Exp $
|
||||
// $Id: nsPkg.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
|
||||
|
||||
unit nsPkg;
|
||||
|
||||
|
|
@ -59,7 +59,7 @@ type
|
|||
sftmsk: nsSftMsk;
|
||||
bitsft: nsBitSft;
|
||||
unitmsk: nsUnitMsk;
|
||||
data: pPRUint32;
|
||||
data: puInt32;
|
||||
end;
|
||||
pnsPkgInt = ^nsPkgInt;
|
||||
|
||||
|
|
@ -88,7 +88,7 @@ end;
|
|||
|
||||
function GETFROMPCK(i: integer; c: pnsPkgInt): integer;
|
||||
begin
|
||||
Result:= (((aPRUint32(c^.data)[i shr integer(c^.idxsft)]) shr (i and integer(c^.sftmsk) shl integer(c^.bitsft))) and integer(c^.unitmsk));
|
||||
Result:= (((auInt32(c^.data)[i shr integer(c^.idxsft)]) shr (i and integer(c^.sftmsk) shl integer(c^.bitsft))) and integer(c^.unitmsk));
|
||||
end;
|
||||
|
||||
end.
|
||||
end.
|
||||
|
|
@ -16,91 +16,90 @@
|
|||
// | http://www.opensource.org/licenses/lgpl-license.php |
|
||||
// +----------------------------------------------------------------------+
|
||||
//
|
||||
// $Id: nsUniversalDetector.pas,v 1.5 2008/06/22 09:04:20 ya_nick Exp $
|
||||
// $Id: nsUniversalDetector.pas,v 1.7 2013/05/16 15:41:14 ya_nick Exp $
|
||||
|
||||
unit nsUniversalDetector;
|
||||
|
||||
interface
|
||||
uses
|
||||
{$I dbg.inc}
|
||||
nsCore,
|
||||
{$I dbg.inc}
|
||||
nsCore,
|
||||
CustomDetector;
|
||||
|
||||
|
||||
const
|
||||
NUM_OF_CHARSET_PROBERS = 4;
|
||||
NUM_OF_CHARSET_PROBERS = 4;
|
||||
|
||||
type nsInputState = (
|
||||
ePureAscii = 0,
|
||||
eEscAscii = 1,
|
||||
eHighbyte = 2
|
||||
) ;
|
||||
type
|
||||
eInputState = (
|
||||
isPureAscii = 0,
|
||||
isEscAscii = 1,
|
||||
isHighbyte = 2
|
||||
);
|
||||
|
||||
TnsUniversalDetector = class (TObject)
|
||||
protected
|
||||
mInputState: nsInputState;
|
||||
mDone: Boolean;
|
||||
mStart: Boolean;
|
||||
mGotData: Boolean;
|
||||
mLastChar: Char;
|
||||
mDetectedCharset: eInternalCharsetID;
|
||||
mCharSetProbers: array [0..Pred(NUM_OF_CHARSET_PROBERS)] of TCustomDetector;
|
||||
mEscCharSetProber: TCustomDetector;
|
||||
mDetectedBOM: eBOMKind;
|
||||
TnsUniversalDetector = class(TObject)
|
||||
protected
|
||||
mInputState: eInputState;
|
||||
mDone: Boolean;
|
||||
mStart: Boolean;
|
||||
mGotData: Boolean;
|
||||
mLastChar: AnsiChar;
|
||||
mDetectedCharset: eInternalCharsetID;
|
||||
mCharSetProbers: array[0..Pred(NUM_OF_CHARSET_PROBERS)] of TCustomDetector;
|
||||
mEscCharSetProber: TCustomDetector;
|
||||
mDetectedBOM: eBOMKind;
|
||||
|
||||
procedure Report(aCharsetID: eInternalCharsetID);
|
||||
function CheckBOM(aBuf: pChar; aLen: integer): integer;
|
||||
function GetCharsetID(CodePage: integer): eInternalCharsetID;
|
||||
procedure DoEnableCharset(Charset: eInternalCharsetID; SetEnabledTo: Boolean);
|
||||
public
|
||||
constructor Create;
|
||||
destructor Destroy; override;
|
||||
procedure Report(aCharsetID: eInternalCharsetID);
|
||||
function CheckBOM(aBuf: pAnsiChar; aLen: integer): integer;
|
||||
function GetCharsetID(CodePage: integer): eInternalCharsetID;
|
||||
procedure DoEnableCharset(Charset: eInternalCharsetID; SetEnabledTo: Boolean);
|
||||
public
|
||||
constructor Create;
|
||||
destructor Destroy; override;
|
||||
|
||||
procedure Reset;
|
||||
function HandleData(aBuf: PChar; aLen: integer): nsResult;
|
||||
procedure DataEnd;
|
||||
procedure Reset;
|
||||
function HandleData(aBuf: pAnsiChar; aLen: integer): nsResult;
|
||||
procedure DataEnd;
|
||||
|
||||
function GetDetectedCharsetInfo: nsCore.rCharsetInfo;
|
||||
function GetDetectedCharsetInfo: nsCore.rCharsetInfo;
|
||||
|
||||
function GetKnownCharset(out KnownCharsets: pChar): integer;
|
||||
procedure GetAbout(out About: rAboutHolder);
|
||||
procedure DisableCharset(CodePage: integer);
|
||||
function GetKnownCharset(out KnownCharsets: String): integer;
|
||||
procedure GetAbout(out About: rAboutHolder);
|
||||
procedure DisableCharset(CodePage: integer);
|
||||
|
||||
property Done: Boolean read mDone;
|
||||
property BOMDetected: eBOMKind read mDetectedBOM;
|
||||
end;
|
||||
property Done: Boolean read mDone;
|
||||
property BOMDetected: eBOMKind read mDetectedBOM;
|
||||
end;
|
||||
|
||||
implementation
|
||||
uses
|
||||
SysUtils,
|
||||
nsGroupProber,
|
||||
nsMBCSMultiProber,
|
||||
nsSBCSGroupProber,
|
||||
nsEscCharsetProber,
|
||||
nsLatin1Prober,
|
||||
nsMBCSMultiProber,
|
||||
nsSBCSGroupProber,
|
||||
nsEscCharsetProber,
|
||||
nsLatin1Prober,
|
||||
MBUnicodeMultiProber;
|
||||
|
||||
|
||||
const
|
||||
MINIMUM_THRESHOLD: float = 0.20;
|
||||
MINIMUM_THRESHOLD: float = 0.20;
|
||||
|
||||
AboutInfo: rAboutHolder = (
|
||||
MajorVersionNr: 0;
|
||||
MinorVersionNr: 2;
|
||||
BuildVersionNr: 6;
|
||||
About: 'Charset Detector Library. Copyright (C) 2006 - 2008, Nick Yakowlew. http://chsdet.sourceforge.net';
|
||||
BuildVersionNr: 8;
|
||||
About: 'Charset Detector Library. Copyright (C) 2006 - 2013, Nick Yakowlew. http://chsdet.sourceforge.net';
|
||||
);
|
||||
{ TnsUniversalDetector }
|
||||
{ TnsUniversalDetector }
|
||||
|
||||
constructor TnsUniversalDetector.Create;
|
||||
begin
|
||||
inherited Create;
|
||||
inherited Create;
|
||||
|
||||
mCharSetProbers[0] := TnsMBCSMultiProber.Create;
|
||||
mCharSetProbers[1] := TnsSBCSGroupProber.Create;
|
||||
mCharSetProbers[2] := TnsLatin1Prober.Create;
|
||||
mCharSetProbers[3] := TMBUnicodeMultiProber.Create;
|
||||
mEscCharSetProber := TnsEscCharSetProber.Create;
|
||||
mEscCharSetProber := TnsEscCharSetProber.Create;
|
||||
Reset;
|
||||
end;
|
||||
|
||||
|
|
@ -108,7 +107,7 @@ destructor TnsUniversalDetector.Destroy;
|
|||
var
|
||||
i: integer;
|
||||
begin
|
||||
for i := 0 to Pred(NUM_OF_CHARSET_PROBERS) do
|
||||
for i := 0 to Pred(NUM_OF_CHARSET_PROBERS) do
|
||||
mCharSetProbers[i].Free;
|
||||
|
||||
mEscCharSetProber.Free;
|
||||
|
|
@ -118,9 +117,9 @@ end;
|
|||
|
||||
procedure TnsUniversalDetector.DataEnd;
|
||||
var
|
||||
proberConfidence: float;
|
||||
proberConfidence: float;
|
||||
maxProberConfidence: float;
|
||||
maxProber: PRInt32;
|
||||
maxProber: int32;
|
||||
i: integer;
|
||||
begin
|
||||
if not mGotData then
|
||||
|
|
@ -135,7 +134,7 @@ begin
|
|||
exit;
|
||||
end;
|
||||
case mInputState of
|
||||
eHighbyte:
|
||||
isHighbyte:
|
||||
begin
|
||||
maxProberConfidence := 0.0;
|
||||
maxProber := 0;
|
||||
|
|
@ -143,72 +142,49 @@ begin
|
|||
begin
|
||||
proberConfidence := mCharSetProbers[i].GetConfidence;
|
||||
if proberConfidence > maxProberConfidence then
|
||||
begin
|
||||
maxProberConfidence := proberConfidence;
|
||||
maxProber := i;
|
||||
end;
|
||||
begin
|
||||
maxProberConfidence := proberConfidence;
|
||||
maxProber := i;
|
||||
end;
|
||||
end;
|
||||
(*do not report anything because we are not confident of it, that's in fact a negative answer*)
|
||||
if maxProberConfidence > MINIMUM_THRESHOLD then
|
||||
Report(mCharSetProbers[maxProber].GetDetectedCharset);
|
||||
Report(mCharSetProbers[maxProber].GetDetectedCharset);
|
||||
end;
|
||||
eEscAscii:
|
||||
begin
|
||||
isEscAscii:
|
||||
begin
|
||||
mDetectedCharset := mEscCharSetProber.GetDetectedCharset;
|
||||
end;
|
||||
else
|
||||
begin
|
||||
mDetectedCharset := PURE_ASCII_CHARSET;
|
||||
end;
|
||||
end;{case}
|
||||
{$ifdef DEBUG_chardet}
|
||||
else
|
||||
begin
|
||||
mDetectedCharset := PURE_ASCII_CHARSET;
|
||||
end;
|
||||
end; {case}
|
||||
{$IFDEF DEBUG_chardet}
|
||||
AddDump('Universal detector - DataEnd');
|
||||
{$endif}
|
||||
{$ENDIF}
|
||||
end;
|
||||
|
||||
function TnsUniversalDetector.HandleData(aBuf: PChar; aLen: integer): nsResult;
|
||||
function TnsUniversalDetector.HandleData(aBuf: pAnsiChar; aLen: integer): nsResult;
|
||||
var
|
||||
i: integer;
|
||||
st: eProbingState;
|
||||
// startAt: integer;
|
||||
//newBuf: pChar;
|
||||
//BufPtr: pChar;
|
||||
//b: integer;
|
||||
//tmpBOM: eBOMKind;
|
||||
begin
|
||||
// startAt := 0;
|
||||
if mDone then
|
||||
begin
|
||||
Result := NS_OK;
|
||||
exit;
|
||||
end;
|
||||
if aLen > 0 then
|
||||
mGotData := TRUE;
|
||||
mGotData := TRUE;
|
||||
|
||||
(*If the data starts with BOM, it should be Unicode, but we continue check*)
|
||||
|
||||
(*If the data starts with BOM, we know it is Unicode*)
|
||||
if mStart then
|
||||
begin
|
||||
mStart := FALSE;
|
||||
// startAt := CheckBOM(aBuf, aLen);
|
||||
CheckBOM(aBuf, aLen);
|
||||
// case mDetectedBOM of
|
||||
// BOM_UCS4_BE: mDetectedCharset := UCS4_BE_CHARSET;
|
||||
// BOM_UCS4_LE: mDetectedCharset := UCS4_LE_CHARSET;
|
||||
// BOM_UTF16_BE: mDetectedCharset := UTF16_BE_CHARSET;
|
||||
// BOM_UTF16_LE: mDetectedCharset := UTF16_LE_CHARSET;
|
||||
// BOM_UTF8: mDetectedCharset := UTF8_CHARSET;
|
||||
//
|
||||
// BOM_UCS4_2143: mDetectedCharset := UCS4_BE_CHARSET;
|
||||
// BOM_UCS4_3412: mDetectedCharset := UCS4_LE_CHARSET;
|
||||
// end;
|
||||
// TODO - some stuppid ASCII text can starts with BOM. What to do?
|
||||
if mDetectedCharset <> UNKNOWN_CHARSET then
|
||||
begin
|
||||
// mDone := TRUE;
|
||||
// Result := NS_OK;
|
||||
// exit;
|
||||
end;
|
||||
end; {if mStart}
|
||||
end; {if mStart}
|
||||
|
||||
for i := 0 to Pred(aLen) do
|
||||
(*other than 0xa0, if every other character is ascii, the page is ascii*)
|
||||
|
|
@ -216,100 +192,70 @@ begin
|
|||
begin
|
||||
(*Since many ASCII-only pages contain NBSP *)
|
||||
(*we got a non-ascii byte (high-byte)*)
|
||||
if mInputState <> eHighbyte then
|
||||
if mInputState <> isHighbyte then
|
||||
begin
|
||||
(*adjust state*)
|
||||
mInputState := eHighbyte;
|
||||
mInputState := isHighbyte;
|
||||
end;
|
||||
end
|
||||
else
|
||||
begin
|
||||
(*ok, just pure ascii so *)
|
||||
if (mInputState = ePureAscii) and
|
||||
((aBuf[i] = #$1B) or
|
||||
(aBuf[i] = '{') and
|
||||
(mLastChar = '~')) then
|
||||
if (mInputState = isPureAscii) and
|
||||
((aBuf[i] = #$1B) or
|
||||
(aBuf[i] = '{') and
|
||||
(mLastChar = '~')) then
|
||||
(*found escape character or HZ "~{"*)
|
||||
mInputState := eEscAscii;
|
||||
mInputState := isEscAscii;
|
||||
|
||||
mLastChar := aBuf[i];
|
||||
end;
|
||||
|
||||
case mInputState of
|
||||
eEscAscii:
|
||||
isEscAscii:
|
||||
begin
|
||||
{$ifdef DEBUG_chardet}
|
||||
{$IFDEF DEBUG_chardet}
|
||||
AddDump('Universal detector - Escape Detector started');
|
||||
{$endif}
|
||||
st := mEscCharSetProber.HandleData(aBuf,aLen);
|
||||
{$ENDIF}
|
||||
st := mEscCharSetProber.HandleData(aBuf, aLen);
|
||||
if st = psFoundIt then
|
||||
begin
|
||||
mDone := TRUE;
|
||||
mDetectedCharset := mEscCharSetProber.GetDetectedCharset;
|
||||
end;
|
||||
end;
|
||||
eHighbyte:
|
||||
isHighbyte:
|
||||
begin
|
||||
{$ifdef DEBUG_chardet}
|
||||
{$IFDEF DEBUG_chardet}
|
||||
AddDump('Universal detector - HighByte Detector started');
|
||||
{$endif}
|
||||
{$ENDIF}
|
||||
for i := 0 to Pred(NUM_OF_CHARSET_PROBERS) do
|
||||
begin
|
||||
//newBuf := AllocMem(aLen+StartAt);
|
||||
//BufPtr := newBuf;
|
||||
//try
|
||||
//tmpBOM := BOM_Not_Found;
|
||||
//if mDetectedBOM = BOM_Not_Found then
|
||||
//begin
|
||||
////case mCharSetProbers[i].GetDetectedCharset of
|
||||
//// UTF16_BE_CHARSET: tmpBOM := BOM_UCS4_BE;
|
||||
//// UTF16_LE_CHARSET: tmpBOM := BOM_UCS4_LE;
|
||||
//// else
|
||||
//// tmpBOM := BOM_Not_Found;
|
||||
////end;
|
||||
//tmpBOM := BOM_UTF16_BE;
|
||||
//end;
|
||||
//for b:=0 to integer(KnownBOM[tmpBOM][0])-1 do
|
||||
//begin
|
||||
//BufPtr^ := KnownBOM[tmpBOM][b+1];
|
||||
//inc(BufPtr);
|
||||
//end;
|
||||
//
|
||||
//for b:=0 to aLen do
|
||||
//begin
|
||||
//BufPtr^ := aBuf[b];
|
||||
//inc(BufPtr);
|
||||
//end;
|
||||
st := mCharSetProbers[i].HandleData(aBuf,aLen);
|
||||
// st := mCharSetProbers[i].HandleData(newBuf,aLen+startAt);
|
||||
if st = psFoundIt then
|
||||
begin
|
||||
mDone:= TRUE;
|
||||
mDetectedCharset := mCharSetProbers[i].GetDetectedCharset;
|
||||
// Result := NS_OK;
|
||||
break;
|
||||
end;
|
||||
//finally
|
||||
//FreeMem(newBuf, aLen);
|
||||
//end;
|
||||
end;
|
||||
st := mCharSetProbers[i].HandleData(aBuf, aLen);
|
||||
if st = psFoundIt then
|
||||
begin
|
||||
mDone := TRUE;
|
||||
mDetectedCharset := mCharSetProbers[i].GetDetectedCharset;
|
||||
break;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
else
|
||||
else
|
||||
(*pure ascii*)
|
||||
begin
|
||||
(*do nothing here*)
|
||||
end;
|
||||
end;{case}
|
||||
end; {case}
|
||||
Result := NS_OK;
|
||||
end;
|
||||
|
||||
procedure TnsUniversalDetector.Report(aCharsetID: eInternalCharsetID);
|
||||
begin
|
||||
|
||||
if (aCharsetID <> UNKNOWN_CHARSET) and
|
||||
(mDetectedCharset = UNKNOWN_CHARSET) then
|
||||
if (aCharsetID <> UNKNOWN_CHARSET) and
|
||||
(mDetectedCharset = UNKNOWN_CHARSET) then
|
||||
|
||||
mDetectedCharset := aCharsetID;
|
||||
mDetectedCharset := aCharsetID;
|
||||
end;
|
||||
|
||||
procedure TnsUniversalDetector.Reset;
|
||||
|
|
@ -320,11 +266,11 @@ begin
|
|||
mStart := TRUE;
|
||||
mDetectedCharset := UNKNOWN_CHARSET;
|
||||
mGotData := FALSE;
|
||||
mInputState := ePureAscii;
|
||||
mLastChar := #0; (*illegal value as signal*)
|
||||
mInputState := isPureAscii;
|
||||
mLastChar := #0; (*illegal value as signal*)
|
||||
mEscCharSetProber.Reset;
|
||||
for i := 0 to Pred(NUM_OF_CHARSET_PROBERS) do
|
||||
mCharSetProbers[i].Reset;
|
||||
mCharSetProbers[i].Reset;
|
||||
mDetectedBOM := BOM_Not_Found;
|
||||
end;
|
||||
|
||||
|
|
@ -333,18 +279,16 @@ begin
|
|||
Result := KNOWN_CHARSETS[mDetectedCharset];
|
||||
end;
|
||||
|
||||
function TnsUniversalDetector.GetKnownCharset(out KnownCharsets: pChar): integer;
|
||||
function TnsUniversalDetector.GetKnownCharset(out KnownCharsets: String): integer;
|
||||
var
|
||||
s: ANSIstring;
|
||||
i: integer;
|
||||
i: eInternalCharsetID;
|
||||
begin
|
||||
s := '';
|
||||
for i := integer(low(KNOWN_CHARSETS)) to integer(High(KNOWN_CHARSETS)) do
|
||||
s := s + #10 + KNOWN_CHARSETS[eInternalCharsetID(i)].Name +
|
||||
' - ' + inttostr(KNOWN_CHARSETS[eInternalCharsetID(i)].CodePage);
|
||||
KnownCharsets := '';
|
||||
for i := low(KNOWN_CHARSETS) to high(KNOWN_CHARSETS) do
|
||||
KnownCharsets := KnownCharsets + #10 + KNOWN_CHARSETS[i].Name +
|
||||
' - ' + IntToStr(KNOWN_CHARSETS[i].CodePage);
|
||||
|
||||
KnownCharsets := pChar(s);
|
||||
Result := Length(s);
|
||||
Result := Length(KnownCharsets);
|
||||
end;
|
||||
|
||||
procedure TnsUniversalDetector.GetAbout(out About: rAboutHolder);
|
||||
|
|
@ -352,30 +296,28 @@ begin
|
|||
About := AboutInfo;
|
||||
end;
|
||||
|
||||
function TnsUniversalDetector.CheckBOM(aBuf: pChar; aLen: integer): integer;
|
||||
function BOMLength(BOM: eBOMKind): integer;
|
||||
begin
|
||||
Result := integer(KnownBOM[BOM, 0]);
|
||||
end;
|
||||
function TnsUniversalDetector.CheckBOM(aBuf: pAnsiChar; aLen: integer): integer;
|
||||
var
|
||||
i, b: integer;
|
||||
Same: Boolean;
|
||||
bom: eBOMKind;
|
||||
i: integer;
|
||||
same: Boolean;
|
||||
begin
|
||||
Result := 0;
|
||||
for i := integer(low(KnownBOM))+1 to integer(high(KnownBOM)) do
|
||||
if aLen > BOMLength(eBOMKind(i)) then
|
||||
mDetectedBOM := BOM_Not_Found;
|
||||
for bom := Succ(low(eBOMKind)) to high(eBomKind) do
|
||||
if aLen > KNOWN_BOM[bom].Length then
|
||||
begin
|
||||
Same := true;
|
||||
for b := 0 to BOMLength(eBOMKind(i)) - 1 do
|
||||
if (aBuf[b] <> KnownBOM[eBOMKind(i), b+1]) then
|
||||
same := true;
|
||||
for i := 0 to KNOWN_BOM[bom].Length - 1 do
|
||||
if (aBuf[i] <> KNOWN_BOM[bom].BOM[i]) then
|
||||
begin
|
||||
Same := false;
|
||||
same := false;
|
||||
break;
|
||||
end;
|
||||
if Same then
|
||||
if same then
|
||||
begin
|
||||
mDetectedBOM := eBOMKind(i);
|
||||
Result := BOMLength(mDetectedBOM);
|
||||
mDetectedBOM := bom;
|
||||
Result := KNOWN_BOM[bom].Length;
|
||||
exit;
|
||||
end;
|
||||
end;
|
||||
|
|
@ -390,7 +332,7 @@ function TnsUniversalDetector.GetCharsetID(CodePage: integer): eInternalCharsetI
|
|||
var
|
||||
i: integer;
|
||||
begin
|
||||
for i := integer(low(KNOWN_CHARSETS))+1 to integer(high(KNOWN_CHARSETS)) do
|
||||
for i := integer(low(KNOWN_CHARSETS)) + 1 to integer(high(KNOWN_CHARSETS)) do
|
||||
if (KNOWN_CHARSETS[eInternalCharsetID(i)].CodePage = CodePage) then
|
||||
begin
|
||||
Result := eInternalCharsetID(i);
|
||||
|
|
@ -423,11 +365,7 @@ begin
|
|||
end;
|
||||
end;
|
||||
|
||||
end;
|
||||
end;
|
||||
|
||||
end.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue