UPD: Charset Detector

This commit is contained in:
Alexander Koblov 2019-06-09 12:15:08 +00:00
commit 877ea0897a
26 changed files with 539 additions and 1246 deletions

View file

@ -1,13 +1,13 @@
-----------Summary
Charset Detector - as the name says - is a stand alone executable module for automatic charset detection of a given text.
Charset Detector - as the name says - is a stand alone component for automatic charset detection of a given text.
It can be useful for internationalisation support in multilingual applications such as web-script editors or Unicode editors.
The given input buffer is analysed to guess the encoding used. The result can be used as a control parameter for a charset conversion procedure.
Charset Detector can be compiled (and hopefully used) for MS Windows (as dll - dynamic link library) or Linux.
Based on Mozilla's i18n component - http://www.mozilla.org/projects/intl/.
Based on Mozilla's i18n component - https://dxr.mozilla.org/mozilla/source/extensions/universalchardet/.
-----------State
Version 0.2.6 stable.
The latest version can be found at http://chsdet.sourceforge.net.
Version 0.2.9 stable.
The original version can be found at http://chsdet.sourceforge.net.
https://sourceforge.net/p/doublecmd/code/HEAD/tree/trunk/components/chsdet/.
-----------Requirements
Charset Detector doesn't need any external components.
@ -16,8 +16,8 @@ Charset Detector doesn't need any external components.
As a result you will get the guessed charset as an MS Windows code page ID and a charset name.
-----------Licence
Charset Detector is open source project and distributed under Lesser GPL.
See the GNU Lesser General Public License for more details - http://www.opensource.org/licenses/lgpl-license.php
Charset Detector is an open source project distributed under the GNU LGPL.
See the GNU Lesser General Public License for more details - https://opensource.org/licenses/LGPL-2.1
-----------Supported charsets
@ -37,11 +37,9 @@ See the GNU Lesser General Public License for more details - http://www.opensour
| 1255 | windows-1255 | |
| 10007 | x-mac-cyrillic | |
| 12000 | X-ISO-10646-UCS-4-2143 | |
| 12000 | UTF-32LE | MS Windows hasn't CP.|
| | | Try to use UCS-4. |
| 12000 | UTF-32LE | |
| 12001 | X-ISO-10646-UCS-4-3412 | |
| 12001 | UTF-32BE | MS Windows hasn't CP.|
| | | Try to use UCS-4. |
| 12001 | UTF-32BE | |
| 20866 | KOI8-R | |
| 28595 | ISO-8859-5 | |
| 28595 | ISO-8859-5 | |
@ -57,7 +55,7 @@ See the GNU Lesser General Public License for more details - http://www.opensour
| 54936 | GB18030 | |
| 65001 | UTF-8 | |
+-----------+---------------------------+------------------------+
-----------Types
Return values
@ -67,76 +65,29 @@ Return values
Returned types
rCharsetInfo = record
Name: pChar; // charset GNU canonical name
CodePage: integer; // MS Windows CodePage id
Language: pChar; //
Name: PAnsiChar; // Charset GNU canonical name
CodePage: Integer; // MS Windows CodePage ID
Language: PAnsiChar;
end;
rAboutHolder = record
MajorVersionNr: Cardinal; // Library's Major Version #
MinorVersionNr: Cardinal; // Library's Minor Version #
BuildVersionNr: Cardinal; // Library's Build/Release Version #
About: pChar; // Copyleft information;
-----------Usage sample
Below is a small usage sample in Free Pascal.
// Runs a TnsUniversalDetector over the bytes of S and returns the
// rCharsetInfo record reported by the detector.
function DetectEncoding(const S: String): rCharsetInfo;
var
Detector: TnsUniversalDetector;
begin
Detector:= TnsUniversalDetector.Create;
try
Detector.Reset;
// The string is handed over as a raw byte buffer of Length(S) bytes.
Detector.HandleData(PAnsiChar(S), Length(S));
// When the detector is not yet sure, tell it that no more data follows.
// NOTE(review): presumably DataEnd makes it settle on its best guess, as the
// former chsd_DataEnd export was documented to do - confirm in nsUniversalDetector.
if not Detector.Done then Detector.DataEnd;
Result:= Detector.GetDetectedCharsetInfo;
finally
// Always release the detector, even if detection raised an exception.
FreeAndNil(Detector);
end;
end;
-----------Exported functions
procedure chsd_Reset; stdcall;
Resets the Charset Detector state and prepares it for a new analysis.
function chsd_HandleData(aBuf: PChar; aLen: integer): integer; stdcall;
Analyse given buffer.
Parameters
aBuf - pointer to buffer with text.
aLen - buffer length.
Return value
NS_ERROR_OUT_OF_MEMORY - failure. Unable to create internal objects.
NS_OK - success.
Note
The function can be called more than once to continue guessing. Charset Detector
remembers the last state until chsd_Reset is called.
function chsd_Done: Boolean; stdcall;
Return value
TRUE - Charset Detector is sure about text encoding.
FALSE - Otherwise.
Note
If the input buffer is smaller than 1K, Charset Detector returns FALSE anyway.
procedure chsd_DataEnd; stdcall;
Signals the end of data. If Charset Detector has no sure result (Done = FALSE),
the best-guess encoding will be set as the result.
function chsd_GetDetectedCharset: rCharsetInfo; stdcall;
Returns guessed charset.
procedure chsd_GetKnownCharsets(var KnownCharsets: pChar);
Fills the parameter with all supported charsets in form
"CodePage - Name LineFeed".
procedure chsd_GetAbout(var About: rAboutHolder); stdcall;
Fills the parameter with version and copyleft information.
-----------Sample
The definition file "chsd_dll_intf.pas" can be found in the same directory.
Below is a small usage sample.
// WS: WideString; // Wide string which can be used in Unicode controls.
// Get encoding of some buffer
chsd_Reset;
chsd_HandleData(aBuf, aLen);
if not chsd_Done then
chsd_DataEnd;
ChSInfo := chsd_GetDetectedCharset();
// convert buffer to WideString
OutputLength := MultiByteToWideChar(ChSInfo.CodePage, 0, aBuf, aLen, nil, 0);
SetLength(WS, OutputLength);
MultiByteToWideChar(ChSInfo.CodePage, 0, aBuf, aLen, PWideChar(WS), OutputLength);
// If you are using Unicode SynEdit
SynEdit.Lines.Text := WS;
Nikolaj Yakowlew © 2006-2008
Copyright (C) 2006-2013 Nikolaj Yakowlew
Copyright (C) 2011-2019 Alexander Koblov

View file

@ -1,70 +0,0 @@
// +----------------------------------------------------------------------+
// | chsdet - Charset Detector Library |
// +----------------------------------------------------------------------+
// | Copyright (C) 2006, Nick Yakowlew http://chsdet.sourceforge.net |
// +----------------------------------------------------------------------+
// | Based on Mozilla sources http://www.mozilla.org/projects/intl/ |
// +----------------------------------------------------------------------+
// | This library is free software; you can redistribute it and/or modify |
// | it under the terms of the GNU General Public License as published by |
// | the Free Software Foundation; either version 2 of the License, or |
// | (at your option) any later version. |
// | This library is distributed in the hope that it will be useful |
// | but WITHOUT ANY WARRANTY; without even the implied warranty of |
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
// | See the GNU Lesser General Public License for more details. |
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: chsd_dll_intf.pas,v 1.4 2009/07/12 15:13:56 ya_nick Exp $
// Import unit for the Charset Detector dynamic library: declares the result
// codes, the records exchanged with the library and the external entry points.
// The unit has no implementation of its own; every routine is bound to
// CharsetDetectorLibrary.
unit chsd_dll_intf;
interface
const
// Result codes. The README lists them as the return values of the
// HandleData call: NS_OK = success, NS_ERROR_OUT_OF_MEMORY = failure
// (unable to create internal objects).
NS_OK = 0;
NS_ERROR_OUT_OF_MEMORY = $8007000e;
type
// Description of a detected charset.
rCharsetInfo = record
// Charset GNU canonical name (per the README).
Name: pChar;
// MS Windows CodePage id (per the README).
CodePage: integer;
// Not described in the README. NOTE(review): presumably the language
// associated with the charset - confirm against the library sources.
Language: pChar;
end;
// Pointer to rCharsetInfo.
prCharsetInfo = ^rCharsetInfo;
// Version and copyleft information filled in by csd_GetAbout.
rAboutHolder = record
// Library's major version number.
MajorVersionNr: Cardinal;
// Library's minor version number.
MinorVersionNr: Cardinal;
// Library's build/release version number.
BuildVersionNr: Cardinal;
// Copyleft information.
About: pChar;
end;
// Kind of byte order mark reported by csd_GetDetectedBOM; the byte
// sequence for each kind is given next to the value.
eBOMKind =(
BOM_Not_Found,
BOM_UCS4_BE, // 00 00 FE FF UCS-4, big-endian machine (1234 order)
BOM_UCS4_LE, // FF FE 00 00 UCS-4, little-endian machine (4321 order)
BOM_UCS4_2143, // 00 00 FF FE UCS-4, unusual octet order (2143)
BOM_UCS4_3412, // FE FF 00 00 UCS-4, unusual octet order (3412)
BOM_UTF16_BE, // FE FF ## ## UTF-16, big-endian
BOM_UTF16_LE, // FF FE ## ## UTF-16, little-endian
BOM_UTF8 // EF BB BF UTF-8
);
const
// File name of the library all routines below are imported from.
CharsetDetectorLibrary = 'chsdet.dll';
// NOTE(review): the README documents these exports with a "chsd_" prefix
// (chsd_Reset, chsd_HandleData, ...) while this unit imports "csd_" names -
// confirm which names the DLL actually exports.
// The short descriptions below are taken from the README entries of the
// correspondingly named chsd_* functions.

// Resets the detector state before a new analysis.
procedure csd_Reset; stdcall; external CharsetDetectorLibrary;
// Analyses aLen bytes starting at aBuf; returns NS_OK or NS_ERROR_OUT_OF_MEMORY.
// May be called repeatedly to continue guessing until csd_Reset is called.
function csd_HandleData(aBuf: PChar; aLen: integer): integer; stdcall; external CharsetDetectorLibrary;
// TRUE when the detector is sure about the text encoding.
function csd_Done: Boolean; stdcall; external CharsetDetectorLibrary;
// Signals the end of data; the best guess becomes the result if not yet sure.
procedure csd_DataEnd; stdcall; external CharsetDetectorLibrary;
// Returns the guessed charset.
function csd_GetDetectedCharset: rCharsetInfo; stdcall; external CharsetDetectorLibrary;
// Fills KnownCharsets with all supported charsets as "CodePage - Name LineFeed".
procedure csd_GetKnownCharsets(var KnownCharsets: pChar); stdcall; external CharsetDetectorLibrary;
// Fills About with version and copyleft information.
procedure csd_GetAbout(var About: rAboutHolder); stdcall; external CharsetDetectorLibrary;
// Not described in the README. NOTE(review): presumably reports the BOM found
// in the analysed data - confirm against the library sources.
function csd_GetDetectedBOM: eBOMKind; stdcall; external CharsetDetectorLibrary;
// Not described in the README. NOTE(review): presumably excludes the given
// code page from detection - confirm against the library sources.
procedure csd_DisableCharsetCP(CodePage: integer); stdcall; external CharsetDetectorLibrary;
implementation
end.

View file

@ -1,227 +1,225 @@
<?xml version="1.0"?>
<CONFIG>
<Package Version="4">
<PathDelim Value="\"/>
<Name Value="chsdet"/>
<CompilerOptions>
<Version Value="11"/>
<PathDelim Value="\"/>
<SearchPaths>
<OtherUnitFiles Value="src;src\sbseq"/>
<UnitOutputDirectory Value="lib\$(TargetCPU)-$(TargetOS)"/>
</SearchPaths>
<CodeGeneration>
<Checks>
<RangeChecks Value="True"/>
<OverflowChecks Value="True"/>
</Checks>
</CodeGeneration>
<Linking>
<Debugging>
<DebugInfoType Value="dsDwarf2Set"/>
</Debugging>
</Linking>
<Other>
<CompilerPath Value="$(CompPath)"/>
</Other>
</CompilerOptions>
<Files Count="46">
<Item1>
<Filename Value="src\Big5Freq.pas"/>
<UnitName Value="Big5Freq"/>
</Item1>
<Item2>
<Filename Value="src\CharDistribution.pas"/>
<UnitName Value="CharDistribution"/>
</Item2>
<Item3>
<Filename Value="src\chsdIntf.pas"/>
<UnitName Value="chsdIntf"/>
</Item3>
<Item4>
<Filename Value="src\CustomDetector.pas"/>
<UnitName Value="CustomDetector"/>
</Item4>
<Item5>
<Filename Value="src\dbg.inc"/>
<Type Value="Include"/>
</Item5>
<Item6>
<Filename Value="src\EUCKRFreq.pas"/>
<UnitName Value="EUCKRFreq"/>
</Item6>
<Item7>
<Filename Value="src\EUCSampler.pas"/>
<UnitName Value="EUCSampler"/>
</Item7>
<Item8>
<Filename Value="src\EUCTWFreq.pas"/>
<UnitName Value="EUCTWFreq"/>
</Item8>
<Item9>
<Filename Value="src\GB2312Freq.pas"/>
<UnitName Value="GB2312Freq"/>
</Item9>
<Item10>
<Filename Value="src\JISFreq.pas"/>
<UnitName Value="JISFreq"/>
</Item10>
<Item11>
<Filename Value="src\JpCntx.pas"/>
<UnitName Value="JpCntx"/>
</Item11>
<Item12>
<Filename Value="src\mbclass\Big5LangModel.inc"/>
<Type Value="Include"/>
</Item12>
<Item13>
<Filename Value="src\mbclass\EUCJPLangModel.inc"/>
<Type Value="Include"/>
</Item13>
<Item14>
<Filename Value="src\mbclass\EUCKRLangModel.inc"/>
<Type Value="Include"/>
</Item14>
<Item15>
<Filename Value="src\mbclass\EUCTWLangModel.inc"/>
<Type Value="Include"/>
</Item15>
<Item16>
<Filename Value="src\mbclass\GB18030LangModel.inc"/>
<Type Value="Include"/>
</Item16>
<Item17>
<Filename Value="src\mbclass\HZLangModel.inc"/>
<Type Value="Include"/>
</Item17>
<Item18>
<Filename Value="src\mbclass\ISO2022CNLangModel.inc"/>
<Type Value="Include"/>
</Item18>
<Item19>
<Filename Value="src\mbclass\ISO2022JPLangModel.inc"/>
<Type Value="Include"/>
</Item19>
<Item20>
<Filename Value="src\mbclass\ISO2022KRLangModel.inc"/>
<Type Value="Include"/>
</Item20>
<Item21>
<Filename Value="src\mbclass\SJISLangModel.inc"/>
<Type Value="Include"/>
</Item21>
<Item22>
<Filename Value="src\mbclass\UCS2BELangModel.inc"/>
<Type Value="Include"/>
</Item22>
<Item23>
<Filename Value="src\mbclass\UCS2LELangModel.inc"/>
<Type Value="Include"/>
</Item23>
<Item24>
<Filename Value="src\mbclass\UTF8LangModel.inc"/>
<Type Value="Include"/>
</Item24>
<Item25>
<Filename Value="src\MBUnicodeMultiProber.pas"/>
<UnitName Value="MBUnicodeMultiProber"/>
</Item25>
<Item26>
<Filename Value="src\MultiModelProber.pas"/>
<UnitName Value="MultiModelProber"/>
</Item26>
<Item27>
<Filename Value="src\nsCodingStateMachine.pas"/>
<UnitName Value="nsCodingStateMachine"/>
</Item27>
<Item28>
<Filename Value="src\nsCore.pas"/>
<UnitName Value="nsCore"/>
</Item28>
<Item29>
<Filename Value="src\nsEscCharsetProber.pas"/>
<UnitName Value="nsEscCharsetProber"/>
</Item29>
<Item30>
<Filename Value="src\nsGroupProber.pas"/>
<UnitName Value="nsGroupProber"/>
</Item30>
<Item31>
<Filename Value="src\nsHebrewProber.pas"/>
<UnitName Value="nsHebrewProber"/>
</Item31>
<Item32>
<Filename Value="src\nsLatin1Prober.pas"/>
<UnitName Value="nsLatin1Prober"/>
</Item32>
<Item33>
<Filename Value="src\nsMBCSMultiProber.pas"/>
<UnitName Value="nsMBCSMultiProber"/>
</Item33>
<Item34>
<Filename Value="src\nsPkg.pas"/>
<UnitName Value="nsPkg"/>
</Item34>
<Item35>
<Filename Value="src\nsSBCharSetProber.pas"/>
<UnitName Value="nsSBCharSetProber"/>
</Item35>
<Item36>
<Filename Value="src\nsSBCSGroupProber.pas"/>
<UnitName Value="nsSBCSGroupProber"/>
</Item36>
<Item37>
<Filename Value="src\nsUniversalDetector.pas"/>
<UnitName Value="nsUniversalDetector"/>
</Item37>
<Item38>
<Filename Value="src\sbseq\LangBulgarianModel.pas"/>
<UnitName Value="LangBulgarianModel"/>
</Item38>
<Item39>
<Filename Value="src\sbseq\LangCyrillicModel.pas"/>
<UnitName Value="LangCyrillicModel"/>
</Item39>
<Item40>
<Filename Value="src\sbseq\LangGreekModel.pas"/>
<UnitName Value="LangGreekModel"/>
</Item40>
<Item41>
<Filename Value="src\sbseq\LangHebrewModel.pas"/>
<UnitName Value="LangHebrewModel"/>
</Item41>
<Item42>
<Filename Value="src\stat\Big5Statistics.inc"/>
<Type Value="Include"/>
</Item42>
<Item43>
<Filename Value="src\stat\EUCJPStatistics.inc"/>
<Type Value="Include"/>
</Item43>
<Item44>
<Filename Value="src\stat\EUCKRStatistics.inc"/>
<Type Value="Include"/>
</Item44>
<Item45>
<Filename Value="src\stat\EUCTWStatistics.inc"/>
<Type Value="Include"/>
</Item45>
<Item46>
<Filename Value="src\stat\GB2312Statistics.inc"/>
<Type Value="Include"/>
</Item46>
</Files>
<RequiredPkgs Count="1">
<Item1>
<PackageName Value="FCL"/>
<MinVersion Major="1" Valid="True"/>
</Item1>
</RequiredPkgs>
<UsageOptions>
<UnitPath Value="$(PkgOutDir)"/>
</UsageOptions>
<PublishOptions>
<Version Value="2"/>
</PublishOptions>
</Package>
</CONFIG>
<?xml version="1.0" encoding="UTF-8"?>
<CONFIG>
<Package Version="4">
<PathDelim Value="\"/>
<Name Value="chsdet"/>
<Author Value="Nikolaj Yakowlew, Alexander Koblov"/>
<CompilerOptions>
<Version Value="11"/>
<PathDelim Value="\"/>
<SearchPaths>
<OtherUnitFiles Value="src;src\sbseq"/>
<UnitOutputDirectory Value="lib\$(TargetCPU)-$(TargetOS)"/>
</SearchPaths>
<CodeGeneration>
<Checks>
<RangeChecks Value="True"/>
<OverflowChecks Value="True"/>
</Checks>
</CodeGeneration>
<Linking>
<Debugging>
<DebugInfoType Value="dsDwarf2Set"/>
</Debugging>
</Linking>
</CompilerOptions>
<Description Value="Charset Detector - as the name says - is a stand alone component for automatic charset detection of a given text.
Given input buffer will be analysed to guess used encoding. The result can be used as control parameter for charset conversation procedure."/>
<License Value="GNU LGPL-2.1"/>
<Version Minor="2" Release="9"/>
<Files Count="45">
<Item1>
<Filename Value="src\Big5Freq.pas"/>
<UnitName Value="Big5Freq"/>
</Item1>
<Item2>
<Filename Value="src\CharDistribution.pas"/>
<UnitName Value="CharDistribution"/>
</Item2>
<Item3>
<Filename Value="src\CustomDetector.pas"/>
<UnitName Value="CustomDetector"/>
</Item3>
<Item4>
<Filename Value="src\dbg.inc"/>
<Type Value="Include"/>
</Item4>
<Item5>
<Filename Value="src\EUCKRFreq.pas"/>
<UnitName Value="EUCKRFreq"/>
</Item5>
<Item6>
<Filename Value="src\EUCSampler.pas"/>
<UnitName Value="EUCSampler"/>
</Item6>
<Item7>
<Filename Value="src\EUCTWFreq.pas"/>
<UnitName Value="EUCTWFreq"/>
</Item7>
<Item8>
<Filename Value="src\GB2312Freq.pas"/>
<UnitName Value="GB2312Freq"/>
</Item8>
<Item9>
<Filename Value="src\JISFreq.pas"/>
<UnitName Value="JISFreq"/>
</Item9>
<Item10>
<Filename Value="src\JpCntx.pas"/>
<UnitName Value="JpCntx"/>
</Item10>
<Item11>
<Filename Value="src\mbclass\Big5LangModel.inc"/>
<Type Value="Include"/>
</Item11>
<Item12>
<Filename Value="src\mbclass\EUCJPLangModel.inc"/>
<Type Value="Include"/>
</Item12>
<Item13>
<Filename Value="src\mbclass\EUCKRLangModel.inc"/>
<Type Value="Include"/>
</Item13>
<Item14>
<Filename Value="src\mbclass\EUCTWLangModel.inc"/>
<Type Value="Include"/>
</Item14>
<Item15>
<Filename Value="src\mbclass\GB18030LangModel.inc"/>
<Type Value="Include"/>
</Item15>
<Item16>
<Filename Value="src\mbclass\HZLangModel.inc"/>
<Type Value="Include"/>
</Item16>
<Item17>
<Filename Value="src\mbclass\ISO2022CNLangModel.inc"/>
<Type Value="Include"/>
</Item17>
<Item18>
<Filename Value="src\mbclass\ISO2022JPLangModel.inc"/>
<Type Value="Include"/>
</Item18>
<Item19>
<Filename Value="src\mbclass\ISO2022KRLangModel.inc"/>
<Type Value="Include"/>
</Item19>
<Item20>
<Filename Value="src\mbclass\SJISLangModel.inc"/>
<Type Value="Include"/>
</Item20>
<Item21>
<Filename Value="src\mbclass\UCS2BELangModel.inc"/>
<Type Value="Include"/>
</Item21>
<Item22>
<Filename Value="src\mbclass\UCS2LELangModel.inc"/>
<Type Value="Include"/>
</Item22>
<Item23>
<Filename Value="src\mbclass\UTF8LangModel.inc"/>
<Type Value="Include"/>
</Item23>
<Item24>
<Filename Value="src\MBUnicodeMultiProber.pas"/>
<UnitName Value="MBUnicodeMultiProber"/>
</Item24>
<Item25>
<Filename Value="src\MultiModelProber.pas"/>
<UnitName Value="MultiModelProber"/>
</Item25>
<Item26>
<Filename Value="src\nsCodingStateMachine.pas"/>
<UnitName Value="nsCodingStateMachine"/>
</Item26>
<Item27>
<Filename Value="src\nsCore.pas"/>
<UnitName Value="nsCore"/>
</Item27>
<Item28>
<Filename Value="src\nsEscCharsetProber.pas"/>
<UnitName Value="nsEscCharsetProber"/>
</Item28>
<Item29>
<Filename Value="src\nsGroupProber.pas"/>
<UnitName Value="nsGroupProber"/>
</Item29>
<Item30>
<Filename Value="src\nsHebrewProber.pas"/>
<UnitName Value="nsHebrewProber"/>
</Item30>
<Item31>
<Filename Value="src\nsLatin1Prober.pas"/>
<UnitName Value="nsLatin1Prober"/>
</Item31>
<Item32>
<Filename Value="src\nsMBCSMultiProber.pas"/>
<UnitName Value="nsMBCSMultiProber"/>
</Item32>
<Item33>
<Filename Value="src\nsPkg.pas"/>
<UnitName Value="nsPkg"/>
</Item33>
<Item34>
<Filename Value="src\nsSBCharSetProber.pas"/>
<UnitName Value="nsSBCharSetProber"/>
</Item34>
<Item35>
<Filename Value="src\nsSBCSGroupProber.pas"/>
<UnitName Value="nsSBCSGroupProber"/>
</Item35>
<Item36>
<Filename Value="src\nsUniversalDetector.pas"/>
<UnitName Value="nsUniversalDetector"/>
</Item36>
<Item37>
<Filename Value="src\sbseq\LangBulgarianModel.pas"/>
<UnitName Value="LangBulgarianModel"/>
</Item37>
<Item38>
<Filename Value="src\sbseq\LangCyrillicModel.pas"/>
<UnitName Value="LangCyrillicModel"/>
</Item38>
<Item39>
<Filename Value="src\sbseq\LangGreekModel.pas"/>
<UnitName Value="LangGreekModel"/>
</Item39>
<Item40>
<Filename Value="src\sbseq\LangHebrewModel.pas"/>
<UnitName Value="LangHebrewModel"/>
</Item40>
<Item41>
<Filename Value="src\stat\Big5Statistics.inc"/>
<Type Value="Include"/>
</Item41>
<Item42>
<Filename Value="src\stat\EUCJPStatistics.inc"/>
<Type Value="Include"/>
</Item42>
<Item43>
<Filename Value="src\stat\EUCKRStatistics.inc"/>
<Type Value="Include"/>
</Item43>
<Item44>
<Filename Value="src\stat\EUCTWStatistics.inc"/>
<Type Value="Include"/>
</Item44>
<Item45>
<Filename Value="src\stat\GB2312Statistics.inc"/>
<Type Value="Include"/>
</Item45>
</Files>
<RequiredPkgs Count="1">
<Item1>
<PackageName Value="FCL"/>
<MinVersion Major="1" Valid="True"/>
</Item1>
</RequiredPkgs>
<UsageOptions>
<UnitPath Value="$(PkgOutDir)"/>
</UsageOptions>
<PublishOptions>
<Version Value="2"/>
</PublishOptions>
</Package>
</CONFIG>

View file

@ -7,7 +7,7 @@ unit chsdet;
interface
uses
Big5Freq, CharDistribution, chsdIntf, CustomDetector, EUCKRFreq, EUCSampler,
Big5Freq, CharDistribution, CustomDetector, EUCKRFreq, EUCSampler,
EUCTWFreq, GB2312Freq, JISFreq, JpCntx, MBUnicodeMultiProber,
MultiModelProber, nsCodingStateMachine, nsCore, nsEscCharsetProber,
nsGroupProber, nsHebrewProber, nsLatin1Prober, nsMBCSMultiProber, nsPkg,

View file

@ -16,7 +16,7 @@
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: Big5Freq.pas,v 1.2 2007/05/20 15:46:02 ya_nick Exp $
// $Id: Big5Freq.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
unit Big5Freq;
@ -49,7 +49,7 @@ const
//Char to FreqOrder table ,
BIG5_TABLE_SIZE = 5376;
Big5CharToFreqOrder: array [0..BIG5_TABLE_SIZE-1] of PRInt16 =
Big5CharToFreqOrder: array [0..BIG5_TABLE_SIZE-1] of int16 =
(
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, // 16
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, // 32
@ -933,4 +933,4 @@ const
****************************************************************************************)
);
implementation
end.
end.

View file

@ -16,7 +16,7 @@
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: CharDistribution.pas,v 1.3 2007/05/26 13:09:38 ya_nick Exp $
// $Id: CharDistribution.pas,v 1.4 2013/04/23 19:47:10 ya_nick Exp $
unit CharDistribution;
@ -32,13 +32,13 @@ type
protected
//mDone: PRBool; (*If this flag is set to PR_TRUE, detection is done and conclusion has been made*)
// YaN: nice idea. Unfortunately is not implemented :((
mFreqChars: PRUint32; (*The number of characters whose frequency order is less than 512*)
mTotalChars: PRUint32; (*Total character encounted.*)
mFreqChars: uInt32; (*The number of characters whose frequency order is less than 512*)
mTotalChars: uInt32; (*Total character encounted.*)
mCharToFreqOrder: pPRInt16; (*Mapping table to get frequency order from char order
mCharToFreqOrder: pInt16; (*Mapping table to get frequency order from char order
(get from GetOrder())*)
mTableSize: PRUint32; (*Size of above table*)
mTableSize: uInt32; (*Size of above table*)
mTypicalDistributionRatio: double;(*This is a constant value varies from language to language,
it is used in calculating confidence.
See my paper for further detail.*)
@ -48,9 +48,9 @@ type
//we do not handle character base on its original encoding string, but
//convert this encoding string to a number, here called order.
//This allow multiple encoding of a language to share one frequency table
function GetOrder(str: PChar): PRInt32; virtual; abstract;
function GetOrder(str: pAnsiChar): int32; virtual; abstract;
(*feed a block of data and do distribution analysis*)
// function HandleData(const aBuf: PChar; aLen: PRUint32): eProbingState; virtual; abstract;
// function HandleData(const aBuf: pAnsiChar; aLen: uInt32): eProbingState; virtual; abstract;
public
destructor Destroy; override;
(*This function is for future extension.
@ -66,7 +66,7 @@ type
function GotEnoughData: Boolean;
(*Feed a character with known length*)
procedure HandleOneChar(aStr: PChar; aCharLen: PRUint32); virtual;
procedure HandleOneChar(aStr: pAnsiChar; aCharLen: uInt32); virtual;
end;
@ -76,7 +76,7 @@ type
(* second byte range: 0xa1 -- 0xfe*)
(*no validation needed here. State machine has done that*)
protected
function GetOrder(str: PChar): PRInt32; override;
function GetOrder(str: pAnsiChar): int32; override;
public
constructor Create; reintroduce;
end;
@ -87,7 +87,7 @@ type
(* second byte range: 0xa1 -- 0xfe*)
(*no validation needed here. State machine has done that*)
protected
function GetOrder(str: PChar): PRInt32; override;
function GetOrder(str: pAnsiChar): int32; override;
public
constructor Create; reintroduce;
end;
@ -98,7 +98,7 @@ type
(* second byte range: 0xa1 -- 0xfe*)
(*no validation needed here. State machine has done that*)
protected
function GetOrder(str: PChar): PRInt32; override;
function GetOrder(str: pAnsiChar): int32; override;
public
constructor Create; reintroduce;
end;
@ -109,7 +109,7 @@ type
(* second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe*)
(*no validation needed here. State machine has done that*)
protected
function GetOrder(str: PChar): PRInt32; override;
function GetOrder(str: pAnsiChar): int32; override;
public
constructor Create; reintroduce;
end;
@ -120,7 +120,7 @@ type
(* second byte range: 0x40 -- 0x7e, 0x81 -- oxfe*)
(*no validation needed here. State machine has done that*)
protected
function GetOrder(str: PChar): PRInt32; override;
function GetOrder(str: pAnsiChar): int32; override;
public
constructor Create; reintroduce;
end;
@ -131,7 +131,7 @@ type
(* second byte range: 0xa1 -- 0xfe*)
(*no validation needed here. State machine has done that*)
protected
function GetOrder(str: PChar): PRInt32; override;
function GetOrder(str: pAnsiChar): int32; override;
public
constructor Create; reintroduce;
end;
@ -151,7 +151,7 @@ begin
inherited;
end;
procedure TCharDistributionAnalysis.HandleOneChar(aStr: PChar; aCharLen: PRUint32);
procedure TCharDistributionAnalysis.HandleOneChar(aStr: pAnsiChar; aCharLen: uInt32);
var
order: integer;
begin
@ -165,7 +165,7 @@ begin
inc(mTotalChars); (*order is valid*)
if order < integer(mTableSize) then
begin
if 512 > aPRint16(mCharToFreqOrder)[order] then
if 512 > aInt16(mCharToFreqOrder)[order] then
inc(mFreqChars);
end;
end;
@ -213,7 +213,7 @@ begin
mTypicalDistributionRatio := EUCTW_TYPICAL_DISTRIBUTION_RATIO;
end;
function TEUCTWDistributionAnalysis.GetOrder(str: PChar): PRInt32;
function TEUCTWDistributionAnalysis.GetOrder(str: pAnsiChar): int32;
begin
if byte(str^) >= $c4 then
Result := 94 * (byte(str[0]) - $c4) + byte(str[1]) - byte($a1)
@ -229,7 +229,7 @@ begin
mTypicalDistributionRatio := EUCKR_TYPICAL_DISTRIBUTION_RATIO;
end;
function TEUCKRDistributionAnalysis.GetOrder(str: PChar): PRInt32;
function TEUCKRDistributionAnalysis.GetOrder(str: pAnsiChar): int32;
begin
if byte(str^) >= $b0 then
Result := 94 * (byte(str[0]) - $b0) + byte(str[1]) - $a1
@ -245,7 +245,7 @@ begin
mTypicalDistributionRatio := GB2312_TYPICAL_DISTRIBUTION_RATIO;
end;
function TGB2312DistributionAnalysis.GetOrder(str: PChar): PRInt32;
function TGB2312DistributionAnalysis.GetOrder(str: pAnsiChar): int32;
begin
if (byte(str[0]) >= $b0) and
(byte(str[1]) >= $a1) then
@ -262,7 +262,7 @@ begin
mTypicalDistributionRatio := BIG5_TYPICAL_DISTRIBUTION_RATIO;
end;
function TBig5DistributionAnalysis.GetOrder(str: PChar): PRInt32;
function TBig5DistributionAnalysis.GetOrder(str: pAnsiChar): int32;
begin
if byte(str[0]) >= $a4 then
begin
@ -283,9 +283,9 @@ begin
mTypicalDistributionRatio := JIS_TYPICAL_DISTRIBUTION_RATIO;
end;
function TSJISDistributionAnalysis.GetOrder(str: PChar): PRInt32;
function TSJISDistributionAnalysis.GetOrder(str: pAnsiChar): int32;
var
order: PRInt32;
order: int32;
begin
if (byte(str[0]) >= $81) and
(byte(str[0]) <= $9f) then
@ -313,7 +313,7 @@ begin
mTypicalDistributionRatio := JIS_TYPICAL_DISTRIBUTION_RATIO;
end;
function TEUCJPDistributionAnalysis.GetOrder(str: PChar): PRInt32;
function TEUCJPDistributionAnalysis.GetOrder(str: pAnsiChar): int32;
begin
if byte(str[0]) >= $a0 then
Result := 94 * (byte(str[0]) - $a1) + byte(str[1]) - $a1
@ -321,7 +321,4 @@ begin
Result:= -1;
end;
end.
end.

View file

@ -1,30 +1,28 @@
unit Dump;
interface
const
nl = #13#10;
var
DumpStr: string;
uses
Classes
;
procedure AddDump(Dump: string);
procedure ShowDump;
procedure SetDumpOutput(DumpOutput: TStrings);
implementation
uses
// Windows;
UNIT1;
var
_DumpOutput: TStrings = nil;
procedure SetDumpOutput(DumpOutput: TStrings);
begin
_DumpOutput := DumpOutput;
end;
procedure AddDump(Dump: string);
begin
UNIT1.Form1.Memo1.Lines.Add(Dump);
// DumpStr := DumpStr + Dump + nl;
end;
procedure ShowDump;
begin
// OutputDebugString(pChar(DumpStr));
// DumpStr := '';
if (_DumpOutput <> nil) then
_DumpOutput.Add(Dump);
end;
end.

View file

@ -16,7 +16,7 @@
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: EUCKRFreq.pas,v 1.2 2007/05/20 15:46:03 ya_nick Exp $
// $Id: EUCKRFreq.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
unit EUCKRFreq;
interface
@ -45,7 +45,7 @@ const
EUCKR_TABLE_SIZE = 2352;
//Char to FreqOrder table ,
EUCKRCharToFreqOrder: array [0..EUCKR_TABLE_SIZE-1] of PRInt16 =
EUCKRCharToFreqOrder: array [0..EUCKR_TABLE_SIZE-1] of int16 =
(
13, 130, 120,1396, 481,1719,1720, 328, 609, 212,1721, 707, 400, 299,1722, 87,
1397,1723, 104, 536,1117,1203,1724,1267, 685,1268, 508,1725,1726,1727,1728,1398,
@ -603,4 +603,3 @@ const
);
implementation
end.

View file

@ -31,7 +31,7 @@ type
constructor Create;
destructor Destroy; override;
function Sample(aIn: pChar; aLen: integer): Boolean;
function Sample(aIn: pAnsiChar; aLen: integer): Boolean;
function GetSomeData: Boolean;
function EnoughData: Boolean;
procedure CalFreq;
@ -114,12 +114,12 @@ begin
end;
end;
function TEUCSampler.Sample(aIn: pChar; aLen: integer): Boolean;
function TEUCSampler.Sample(aIn: pAnsiChar; aLen: integer): Boolean;
const
MAX_LENGTH: integer = MaxInt;// $80000000;
var
i: integer;
p: pChar;
p: pAnsiChar;
begin
if (mState = 1) then
begin
@ -175,4 +175,4 @@ begin
Result := ( mState <> 1 );
end;
end.
end.

View file

@ -16,7 +16,7 @@
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: EUCTWFreq.pas,v 1.2 2007/05/20 15:46:04 ya_nick Exp $
// $Id: EUCTWFreq.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
unit EUCTWFreq;
@ -51,7 +51,7 @@ const
//Char to FreqOrder table ,
EUCTW_TABLE_SIZE = 8102-2742+16;
EUCTWCharToFreqOrder: array [0..EUCTW_TABLE_SIZE-1] of PRInt16 =
EUCTWCharToFreqOrder: array [0..EUCTW_TABLE_SIZE-1] of int16 =
(
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, // 2742
3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, // 2758
@ -437,4 +437,4 @@ const
****************************************************************************************)
);
implementation
end.
end.

View file

@ -16,7 +16,7 @@
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: GB2312Freq.pas,v 1.2 2007/05/20 15:46:04 ya_nick Exp $
// $Id: GB2312Freq.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
unit GB2312Freq;
@ -47,7 +47,7 @@ const
GB2312_TABLE_SIZE = 3760;
GB2312CharToFreqOrder: array [0..GB2312_TABLE_SIZE-1] of PRInt16 =
GB2312CharToFreqOrder: array [0..GB2312_TABLE_SIZE-1] of int16 =
(
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
@ -483,4 +483,3 @@ implementation
end.

View file

@ -16,7 +16,7 @@
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: JISFreq.pas,v 1.2 2007/05/20 15:46:04 ya_nick Exp $
// $Id: JISFreq.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
unit JISFreq;
@ -49,7 +49,7 @@ const
//Char to FreqOrder table ,
JIS_TABLE_SIZE = 4368;
JISCharToFreqOrder: array [0..JIS_TABLE_SIZE-1] of PRInt16 =
JISCharToFreqOrder: array [0..JIS_TABLE_SIZE-1] of int16 =
(
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, // 16
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, // 32
@ -577,4 +577,4 @@ const
);
implementation
end.
end.

View file

@ -16,13 +16,14 @@
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: JpCntx.pas,v 1.2 2007/05/20 15:46:05 ya_nick Exp $
// $Id: JpCntx.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
unit JpCntx;
interface
uses
nsCore;
{$HINTS OFF} // Kylix gives an invalid unused hint for TJapaneseContextAnalysis.GetOrder(str: pAnsiChar)
const
NUM_OF_CATEGORY = 6;
@ -31,9 +32,9 @@ type
TJapaneseContextAnalysis = class (TObject)
private
(*category counters, each interger counts sequence in its category*)
mRelSample: array [0..Pred(NUM_OF_CATEGORY)] of PRUint32;
mRelSample: array [0..Pred(NUM_OF_CATEGORY)] of uInt32;
(*total sequence received*)
mTotalRel: PRUint32;
mTotalRel: uInt32;
(*The order of previous char*)
mLastCharOrder: integer;
(*if last byte in current buffer is not the last byte of a character, we*)
@ -42,29 +43,29 @@ type
(*If this flag is set to PR_TRUE, detection is done and conclusion has been made*)
mDone: Boolean;
function GetOrder(str: PChar; charLen: pPRUint32): PRInt32; overload; virtual; abstract;
function GetOrder(str: PChar): PRInt32; overload; virtual; abstract;
function GetOrder(str: pAnsiChar; charLen: puInt32): int32; overload; virtual; abstract;
function GetOrder(str: pAnsiChar): int32; overload; virtual; abstract;
public
constructor Create;
destructor Destroy; override;
procedure Reset;
procedure HandleData(const aBuf: PChar; aLen: integer);
procedure HandleOneChar(aStr: PChar; aCharLen: integer);
procedure HandleData(const aBuf: pAnsiChar; aLen: integer);
procedure HandleOneChar(aStr: pAnsiChar; aCharLen: integer);
function GotEnoughData: Boolean;
function GetConfidence: float;
end;
TSJISContextAnalysis = class (TJapaneseContextAnalysis)
public
function GetOrder(str: PChar; charLen: pPRUint32): PRInt32; overload; override;
function GetOrder(str: PChar): PRInt32; overload; override;
function GetOrder(str: pAnsiChar; charLen: puInt32): int32; overload; override;
function GetOrder(str: pAnsiChar): int32; overload; override;
end;
TEUCJPContextAnalysis = class (TJapaneseContextAnalysis)
public
function GetOrder(str: PChar; charLen: pPRUint32): PRInt32; overload; override;
function GetOrder(str: pAnsiChar; charLen: puInt32): int32; overload; override;
(*We are only interested in Hiragana, so the first byte is '\244'*)
function GetOrder(str: PChar): PRInt32; overload; override;
function GetOrder(str: pAnsiChar): int32; overload; override;
end;
implementation
@ -174,9 +175,9 @@ begin
inherited;
end;
procedure TJapaneseContextAnalysis.HandleOneChar(aStr: PChar; aCharLen: integer);
procedure TJapaneseContextAnalysis.HandleOneChar(aStr: pAnsiChar; aCharLen: integer);
var
order: PRInt32; (*if we received enough data, stop here *)
order: int32; (*if we received enough data, stop here *)
begin
if mTotalRel > MAX_REL_THRESHOLD then
mDone:= TRUE;
@ -210,10 +211,10 @@ begin
Result := DONT_KNOW;
end;
procedure TJapaneseContextAnalysis.HandleData(const aBuf: PChar; aLen: integer);
procedure TJapaneseContextAnalysis.HandleData(const aBuf: pAnsiChar; aLen: integer);
var
charLen: PRUint32;
order: PRInt32;
charLen: uInt32;
order: int32;
i: integer;
begin
if mDone then
@ -265,7 +266,7 @@ end;
{ TSJISContextAnalysis }
function TSJISContextAnalysis.GetOrder(str: PChar; charLen: pPRUint32): PRInt32;
function TSJISContextAnalysis.GetOrder(str: pAnsiChar; charLen: puInt32): int32;
begin
(*find out current char's byte length*)
if (byte(str^) >= $81) and
@ -284,7 +285,7 @@ begin
Result:= -1;
end;
function TSJISContextAnalysis.GetOrder(str: PChar): PRInt32;
function TSJISContextAnalysis.GetOrder(str: pAnsiChar): int32;
begin
(*We are only interested in Hiragana, so the first byte is '\202'*)
if (str[0]=#$82) and
@ -297,7 +298,7 @@ end;
{ TEUCJPContextAnalysis }
function TEUCJPContextAnalysis.GetOrder(str: PChar; charLen: pPRUint32): PRInt32;
function TEUCJPContextAnalysis.GetOrder(str: pAnsiChar; charLen: puInt32): int32;
begin
(*find out current char's byte length*)
if (byte(str^) = $8e) or
@ -318,7 +319,7 @@ begin
Result:= -1;
end;
function TEUCJPContextAnalysis.GetOrder(str: PChar): PRInt32;
function TEUCJPContextAnalysis.GetOrder(str: pAnsiChar): int32;
begin
if (str[0]=#$A4) and
(byte(str[1]) >= $a1) and
@ -328,4 +329,4 @@ begin
Result := -1;
end;
end.
end.

View file

@ -16,7 +16,7 @@
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: MBUnicodeMultiProber.pas,v 1.2 2007/05/26 13:09:38 ya_nick Exp $
// $Id: MBUnicodeMultiProber.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
unit MBUnicodeMultiProber;
@ -32,7 +32,7 @@ type
public
constructor Create; reintroduce;
destructor Destroy; override;
function HandleData(aBuf: PChar; aLen: integer): eProbingState; override;
function HandleData(aBuf: pAnsiChar; aLen: integer): eProbingState; override;
// function GetConfidence: double; override;
end;
@ -50,6 +50,7 @@ uses
{ TMBUnicodeMultiProber }
const
NUM_OF_PROBERS = 3;
{$IFDEF FPC}{$NOTES OFF}{$ENDIF}
ONE_CHAR_PROB: float = 0.50;
{$ifdef DEBUG_chardet}
@ -69,11 +70,11 @@ begin
inherited;
end;
function TMBUnicodeMultiProber.HandleData(aBuf: PChar; aLen: integer): eProbingState;
function TMBUnicodeMultiProber.HandleData(aBuf: pAnsiChar; aLen: integer): eProbingState;
var
i: integer; (*do filtering to reduce load to probers*)
highbyteBuf: PChar;
hptr: PChar;
highbyteBuf: pAnsiChar;
hptr: pAnsiChar;
keepNext: Boolean;
begin
keepNext := TRUE;
@ -142,4 +143,4 @@ end;
// mDetectedCharset := UNKNOWN_CHARSET;
//end;
end.
end.

View file

@ -21,7 +21,7 @@ type
constructor Create; override;
destructor Destroy; override;
function HandleData(aBuf: PChar; aLen: integer): eProbingState; override;
function HandleData(aBuf: pAnsiChar; aLen: integer): eProbingState; override;
function GetDetectedCharset: eInternalCharsetID; override;
procedure Reset; override;
function EnableCharset(Charset: eInternalCharsetID; NewValue: Boolean): Boolean;
@ -76,7 +76,7 @@ begin
Result := mDetectedCharset;
end;
function TMultiModelProber.HandleData(aBuf: PChar; aLen: integer): eProbingState;
function TMultiModelProber.HandleData(aBuf: pAnsiChar; aLen: integer): eProbingState;
var
codingState: nsSMState;
j: integer;
@ -227,4 +227,3 @@ end;
{$endif}
end.

View file

@ -1,100 +0,0 @@
// +----------------------------------------------------------------------+
// | chsdet - Charset Detector Library |
// +----------------------------------------------------------------------+
// | Copyright (C) 2006, Nick Yakowlew http://chsdet.sourceforge.net |
// +----------------------------------------------------------------------+
// | Based on Mozilla sources http://www.mozilla.org/projects/intl/ |
// +----------------------------------------------------------------------+
// | This library is free software; you can redistribute it and/or modify |
// | it under the terms of the GNU General Public License as published by |
// | the Free Software Foundation; either version 2 of the License, or |
// | (at your option) any later version. |
// | This library is distributed in the hope that it will be useful |
// | but WITHOUT ANY WARRANTY; without even the implied warranty of |
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
// | See the GNU Lesser General Public License for more details. |
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: chsdIntf.pas,v 1.4 2008/06/22 09:04:20 ya_nick Exp $
unit chsdIntf;
// Flat, procedural (stdcall) interface to the charset detector.
// Every routine below forwards to one unit-level TnsUniversalDetector
// instance that is created in this unit's initialization section and
// released in its finalization section.
interface
uses
nsCore;
// Forwards to Detector.Reset.
procedure csd_Reset; stdcall;
// Passes aLen bytes starting at aBuf to Detector.HandleData and returns its result.
function csd_HandleData(aBuf: PChar; aLen: integer): integer; stdcall;
// Returns Detector.Done.
function csd_Done: boolean; stdcall;
// Forwards to Detector.DataEnd.
procedure csd_DataEnd; stdcall;
// Returns Detector.GetDetectedCharsetInfo (record type declared in nsCore).
function csd_GetDetectedCharset: rCharsetInfo; stdcall;
// Forwards to Detector.GetKnownCharset; KnownCharsets is filled by the detector
// and the detector's integer result is returned unchanged.
function csd_GetKnownCharsets(var KnownCharsets: pChar): integer; stdcall;
// Forwards to Detector.GetAbout, which fills the About record.
procedure csd_GetAbout(var About: rAboutHolder); stdcall;
// Returns Detector.BOMDetected.
function csd_GetDetectedBOM: eBOMKind; stdcall;
// Forwards CodePage to Detector.DisableCharset.
procedure csd_DisableCharsetCP(CodePage: integer); stdcall;
implementation
uses
nsUniversalDetector;
var
// The single detector instance shared by all csd_* routines.
// NOTE(review): no locking is visible in this unit - presumably callers are
// expected to use the API from one thread at a time; confirm before relying on it.
Detector: TnsUniversalDetector = nil;
procedure csd_Reset; stdcall;
begin
Detector.Reset;
end;
function csd_HandleData(aBuf: PChar; aLen: integer): integer; stdcall;
begin
Result := Detector.HandleData(aBuf, aLen);
end;
function csd_Done: boolean; stdcall;
begin
Result := Detector.Done;
end;
procedure csd_DataEnd; stdcall;
begin
Detector.DataEnd;
end;
function csd_GetDetectedCharset: rCharsetInfo; stdcall;
begin
Result := Detector.GetDetectedCharsetInfo;
end;
function csd_GetKnownCharsets(var KnownCharsets: pChar): integer; stdcall;
begin
Result := Detector.GetKnownCharset(KnownCharsets);
end;
procedure csd_GetAbout(var About: rAboutHolder); stdcall;
begin
Detector.GetAbout(About);
end;
function csd_GetDetectedBOM: eBOMKind; stdcall;
begin
Result := Detector.BOMDetected;
end;
procedure csd_DisableCharsetCP(CodePage: integer); stdcall;
begin
Detector.DisableCharset(CodePage);
end;
initialization
// Create the shared detector when the unit is loaded.
Detector := TnsUniversalDetector.Create;
finalization
// Release the shared detector when the unit is unloaded.
if Detector <> nil then
Detector.Free;
end.

View file

@ -1,161 +0,0 @@
[FileVersion]
Version=6.0
[Compiler]
A=8
B=0
C=1
D=1
E=0
F=0
G=1
H=1
I=1
J=1
K=0
L=1
M=0
N=1
O=1
P=1
Q=0
R=0
S=0
T=0
U=1
V=1
W=0
X=1
Y=2
Z=1
ShowHints=1
ShowWarnings=1
UnitAliases=WinTypes=Windows;WinProcs=Windows;DbiTypes=BDE;DbiProcs=BDE;DbiErrs=BDE;
[Linker]
MapFile=0
OutputObjs=0
ConsoleApp=1
DebugInfo=0
RemoteSymbols=0
MinStackSize=16384
MaxStackSize=1048576
ImageBase=4194304
ExeDescription=
[Directories]
OutputDir=..\
UnitOutputDir=..\dcu
PackageDLLOutputDir=
PackageDCPOutputDir=
SearchPath=.\mbclass;.\sbseq;.\stat
Packages=VCL50;VCLX50;VCLSMP50;QRPT50;VCLDB50;VCLIE50;INETDB50;INET50;NMFAST50;dclocx50;dclaxserver50;DJCL50;JVAPPFRMD5R;JVCORED5R;JVBANDSD5R;JVDLGSD5R;JVCMPD5R;JVCRYPTD5R;JVCTRLSD5R;JVCUSTOMD5R;JVDOCKINGD5R;JVDOTNETCTRLSD5R;JVEDID5R;JVGLOBUSD5R;JVHMID5R;JVINSPECTORD5R;JVINTERPRETERD5R;JVJANSD5R;JVMANAGEDTHREADSD5R;JVMMD5R;JVNETD5R;JVSTDCTRLSD5R;JVPAGECOMPSD5R;JVPLUGIND5R;JVPRINTPREVIEWD5R;JVSYSTEMD5R;JVTIMEFRAMEWORKD5R;JVUIBD5R;JVVALIDATORSD5R;JVWIZARDD5R;JVXPCTRLSD5R;vcl
Conditionals=
DebugSourceDirs=
UsePackages=0
[Parameters]
RunParams=
HostApplication=
Launcher=
UseLauncher=0
DebugCWD=
[Language]
ActiveLang=
ProjectLang=$00000407
[Version Info]
IncludeVerInfo=1
AutoIncBuild=0
MajorVer=0
MinorVer=2
Release=6
Build=2
Debug=0
PreRelease=0
Special=0
Private=0
DLL=1
Locale=2057
CodePage=1252
[Version Info Keys]
CompanyName=
FileDescription=Charset detector
FileVersion=0.2.6.2
InternalName=
LegalCopyright=Nick Yakowlew, ya_nick@users.sourceforge.net
LegalTrademarks=
OriginalFilename=chsdet.dll
ProductName=Charset detector
ProductVersion=0.2
Comments=LGPL Licence
[Excluded Packages]
E:\Data\Yan\Delphi\log4delphi\bin\log4delphi_D6.bpl=Log4Delphi 0.5
c:\program files\borland\delphi6\Bin\DCLNMF60.bpl=NetMasters Fastnet Tools
C:\PROGRAM FILES\BORLAND\DELPHI6\PROJECTS\BPL\TBX_D6.BPL=Toolbar2000 -- TBX Extensions (Alex Denisov)
C:\PROGRAM FILES\BORLAND\DELPHI6\PROJECTS\BPL\TB2K_D6.BPL=Toolbar2000 Components (Jordan Russell)
C:\PROGRAM FILES\BORLAND\DELPHI6\PROJECTS\BPL\TBXDSGN_D6.BPL=Toolbar2000 -- TBX Extensions Design Package (Alex Denisov)
C:\PROGRAM FILES\BORLAND\DELPHI6\PROJECTS\BPL\TB2KDSGN_D6.BPL=Toolbar2000 Design Package (Jordan Russell)
c:\program files\borland\delphi6\Projects\Bpl\IEDcomp.bpl=Internet EDiting components
C:\PROGRAM FILES\BORLAND\DELPHI6\PROJECTS\BPL\TNTUNICODEVCL_D60.BPL=Tnt Unicode Controls
c:\program files\borland\delphi6\Projects\Bpl\SmpltCP.bpl=(untitled)
c:\program files\borland\delphi6\Projects\Bpl\devFileMonitorPkg.bpl=(untitled)
c:\program files\borland\delphi6\Bin\dclsoap60.bpl=Borland SOAP Components
c:\program files\borland\delphi6\Projects\Bpl\SpTBXLibDsgn_d6.bpl=Toolbar2000 -- SpTBXLib Design Package
c:\program files\borland\delphi6\Projects\Bpl\LSFindReplaceDialogW_6.bpl=LS Find/Replace Dialog for Wide Strings
c:\program files\borland\delphi6\Projects\Bpl\Unicode6.bpl=Unicode components
c:\program files\borland\delphi6\Projects\Bpl\credit.bpl=(untitled)
c:\program files\borland\delphi6\Projects\Bpl\pActivePorts.bpl=LGM ActivePorts Component
c:\program files\borland\delphi6\Projects\Bpl\USE.bpl=Unicode Syntax Edit control
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvAppFrmD6D.bpl=JVCL Application and Form Components
C:\PROGRAM FILES\BORLAND\DELPHI6\PROJECTS\BPL\JVCORED6D.BPL=JVCL Core Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvCmpD6D.bpl=JVCL Non-Visual Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvCryptD6D.bpl=JVCL Encryption and Compression Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvCtrlsD6D.bpl=JVCL Visual Controls
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvCustomD6D.bpl=JVCL Custom Controls
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvDlgsD6D.bpl=JVCL Dialog Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvDockingD6D.bpl=JVCL Docking Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvGlobusD6D.bpl=JVCL Globus Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvHMID6D.bpl=JVCL HMI Controls design time unit
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvJansD6D.bpl=JVCL Jans Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvManagedThreadsD6D.bpl=JVCL Managed Threads
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvMMD6D.bpl=JVCL Multimedia and Image Components
C:\PROGRAM FILES\BORLAND\DELPHI6\PROJECTS\BPL\JVSTDCTRLSD6D.BPL=JVCL Standard Controls
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvPageCompsD6D.bpl=JVCL Page Style Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvPluginD6D.bpl=JVCL Plugin Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvSystemD6D.bpl=JVCL System Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvTimeFrameworkD6D.bpl=JVCL Time Framework
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvValidatorsD6D.bpl=JVCL Validators and Error Provider Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvXPCtrlsD6D.bpl=JVCL XP Controls
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvBandsD6D.bpl=JVCL Band Objects
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvBDED6D.bpl=JVCL BDE Components
C:\PROGRAM FILES\BORLAND\DELPHI6\PROJECTS\BPL\JVDBD6D.BPL=JVCL Database Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvDotNetCtrlsD6D.bpl=JVCL DotNet Controls
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvEDID6D.bpl=JVCL EDI Components Designtime Package
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvInspectorD6D.bpl=JVCL Inspector Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvInterpreterD6D.bpl=JVCL Interpreter Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvNetD6D.bpl=JVCL Network Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvPrintPreviewD6D.bpl=JVCL Print Preview Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvUIBD6D.bpl=JVCL Unified Interbase Components
C:\Program Files\Borland\Delphi6\Projects\Bpl\JvWizardD6D.bpl=JVCL Wizard Design Time Package
c:\program files\borland\delphi6\Projects\Bpl\components.bpl=Components for tsWebEditor
c:\program files\borland\delphi6\Projects\Bpl\CoolTrayIcon_D6plus.bpl=CoolTrayIcon and Friends
C:\PROGRAM FILES\BORLAND\DELPHI6\BIN\DCLBDE60.BPL=Borland BDE DB Components
C:\PROGRAM FILES\BORLAND\DELPHI6\BIN\DBX60.BPL=Borland SQL Explorer UI Package
c:\program files\borland\delphi6\Projects\Bpl\ClassBrowsing.bpl=ClassBrowsing components
c:\program files\borland\delphi6\Bin\dclqrt60.bpl=QuickReport Components
c:\program files\borland\delphi6\Bin\dclcds60.bpl=Borland Base Cached ClientDataset Component
C:\PROGRAM FILES\BORLAND\DELPHI6\BIN\DCLMID60.BPL=Borland MyBase DataAccess Components
c:\program files\borland\delphi6\Bin\dclbdecds60.bpl=Borland Local BDE ClientDataset Components
c:\program files\borland\delphi6\Bin\dcltee60.bpl=TeeChart Components
c:\program files\borland\delphi6\Bin\dcltqr60.bpl=TeeChart for QuickReport Components
c:\program files\borland\delphi6\Bin\dclib60.bpl=InterBase Data Access Components
c:\program files\borland\delphi6\Bin\dcldbxcds60.bpl=Borland Local DBX ClientDataset Components
c:\program files\borland\delphi6\Bin\DBWEBXPRT.BPL=Borland Web Wizard Package
c:\program files\borland\delphi6\Projects\Bpl\prgInternet6.bpl=Progsan Internet Components
c:\program files\borland\delphi6\Projects\Bpl\Comps_D6.bpl=(untitled)
c:\program files\borland\delphi6\Projects\Bpl\SynEdit_D6.bpl=SynEdit component suite
c:\program files\borland\delphi6\Projects\Bpl\DevCpp.bpl=Dev-c++ components

View file

@ -1,38 +0,0 @@
// +----------------------------------------------------------------------+
// | chsdet - Charset Detector Library |
// +----------------------------------------------------------------------+
// | Copyright (C) 2006, Nick Yakowlew http://chsdet.sourceforge.net |
// +----------------------------------------------------------------------+
// | Based on Mozilla sources http://www.mozilla.org/projects/intl/ |
// +----------------------------------------------------------------------+
// | This library is free software; you can redistribute it and/or modify |
// | it under the terms of the GNU General Public License as published by |
// | the Free Software Foundation; either version 2 of the License, or |
// | (at your option) any later version. |
// | This library is distributed in the hope that it will be useful |
// | but WITHOUT ANY WARRANTY; without even the implied warranty of |
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
// | See the GNU Lesser General Public License for more details. |
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: chsdet.dpr,v 1.3 2007/05/26 13:07:21 ya_nick Exp $
library chsdet;

// Delphi project file for the chsdet dynamic library.
// It contains no logic of its own: it only publishes the flat stdcall API
// implemented in chsdIntf.pas through the exports clause below.

uses
  chsdIntf in 'chsdIntf.pas';

exports
  csd_Reset,
  csd_HandleData,
  csd_Done,
  csd_DataEnd,
  csd_GetDetectedCharset,
  csd_GetKnownCharsets,
  csd_GetAbout,
  // These two routines are declared (stdcall) in chsdIntf's interface section
  // like the rest of the API; they must be listed here to be reachable by
  // clients of the compiled library.
  csd_GetDetectedBOM,
  csd_DisableCharsetCP;

{$R *.res}

begin
end.

View file

@ -1,172 +0,0 @@
<?xml version="1.0"?>
<CONFIG>
<ProjectOptions>
<PathDelim Value="/"/>
<Version Value="6"/>
<General>
<Flags>
<MainUnitHasUsesSectionForAllUnits Value="False"/>
<MainUnitHasCreateFormStatements Value="False"/>
<MainUnitHasTitleStatement Value="False"/>
<Runnable Value="False"/>
</Flags>
<MainUnit Value="0"/>
<IconPath Value="./"/>
<TargetFileExt Value=""/>
<UseAppBundle Value="False"/>
<ActiveEditorIndexAtStart Value="0"/>
</General>
<VersionInfo>
<UseVersionInfo Value="True"/>
<ProjectVersion Value=""/>
</VersionInfo>
<PublishOptions>
<Version Value="2"/>
<IgnoreBinaries Value="False"/>
<IncludeFileFilter Value="*.(pas|pp|inc|lfm|lpr|lrs|lpi|lpk|sh|xml)"/>
<ExcludeFileFilter Value="*.(bak|ppu|ppw|o|so);*~;backup"/>
</PublishOptions>
<RunParams>
<local>
<FormatVersion Value="1"/>
<LaunchingApplication PathPlusParams="/usr/X11R6/bin/xterm -T 'Lazarus Run Output' -e $(LazarusDir)/tools/runwait.sh $(TargetCmdLine)"/>
</local>
</RunParams>
<RequiredPackages Count="1">
<Item1>
<PackageName Value="LCL"/>
</Item1>
</RequiredPackages>
<Units Count="10">
<Unit0>
<Filename Value="chsdet.lpr"/>
<IsPartOfProject Value="True"/>
<UnitName Value="chsdet"/>
<CursorPos X="9" Y="38"/>
<TopLine Value="1"/>
<EditorIndex Value="0"/>
<UsageCount Value="20"/>
<Loaded Value="True"/>
</Unit0>
<Unit1>
<Filename Value=""/>
<UsageCount Value="10"/>
</Unit1>
<Unit2>
<Filename Value="nsSBCharSetProber.pas"/>
<UnitName Value="nsSBCharSetProber"/>
<CursorPos X="1" Y="24"/>
<TopLine Value="12"/>
<EditorIndex Value="2"/>
<UsageCount Value="10"/>
<Loaded Value="True"/>
</Unit2>
<Unit3>
<Filename Value="sbseq/LangHebrewModel.pas"/>
<IsPartOfProject Value="True"/>
<UsageCount Value="20"/>
<SyntaxHighlighter Value="Text"/>
</Unit3>
<Unit4>
<Filename Value="sbseq/LangBulgarianModel.pas"/>
<IsPartOfProject Value="True"/>
<UsageCount Value="20"/>
<SyntaxHighlighter Value="Text"/>
</Unit4>
<Unit5>
<Filename Value="sbseq/LangCyrillicModel.pas"/>
<IsPartOfProject Value="True"/>
<UsageCount Value="20"/>
<SyntaxHighlighter Value="Text"/>
</Unit5>
<Unit6>
<Filename Value="sbseq/LangGreekModel.pas"/>
<IsPartOfProject Value="True"/>
<UsageCount Value="20"/>
<SyntaxHighlighter Value="Text"/>
</Unit6>
<Unit7>
<Filename Value="chsdIntf.pas"/>
<UnitName Value="chsdIntf"/>
<CursorPos X="13" Y="31"/>
<TopLine Value="9"/>
<EditorIndex Value="1"/>
<UsageCount Value="10"/>
<Loaded Value="True"/>
</Unit7>
<Unit8>
<Filename Value="nsCore.pas"/>
<UnitName Value="nsCore"/>
<CursorPos X="1" Y="1"/>
<TopLine Value="17"/>
<UsageCount Value="10"/>
</Unit8>
<Unit9>
<Filename Value="dbg.inc"/>
<CursorPos X="1" Y="1"/>
<TopLine Value="1"/>
<UsageCount Value="10"/>
</Unit9>
</Units>
<JumpHistory Count="6" HistoryIndex="5">
<Position1>
<Filename Value="nsSBCharSetProber.pas"/>
<Caret Line="46" Column="19" TopLine="24"/>
</Position1>
<Position2>
<Filename Value="nsSBCharSetProber.pas"/>
<Caret Line="28" Column="10" TopLine="12"/>
</Position2>
<Position3>
<Filename Value="nsSBCharSetProber.pas"/>
<Caret Line="24" Column="1" TopLine="24"/>
</Position3>
<Position4>
<Filename Value="nsSBCharSetProber.pas"/>
<Caret Line="46" Column="19" TopLine="24"/>
</Position4>
<Position5>
<Filename Value="chsdet.lpr"/>
<Caret Line="35" Column="17" TopLine="1"/>
</Position5>
<Position6>
<Filename Value="chsdet.lpr"/>
<Caret Line="32" Column="12" TopLine="1"/>
</Position6>
</JumpHistory>
</ProjectOptions>
<CompilerOptions>
<Version Value="5"/>
<SearchPaths>
<UnitOutputDirectory Value="/home/yan/chsdet/dcu"/>
</SearchPaths>
<Parsing>
<SyntaxOptions>
<DelphiCompat Value="True"/>
</SyntaxOptions>
</Parsing>
<CodeGeneration>
<SmartLinkUnit Value="True"/>
<Generate Value="Faster"/>
<TargetProcessor Value="3"/>
<TargetCPU Value="i386"/>
<TargetOS Value="Linux"/>
</CodeGeneration>
<Linking>
<LinkSmart Value="True"/>
</Linking>
<Other>
<CompilerPath Value="$(CompPath)"/>
</Other>
</CompilerOptions>
<Debugging>
<Exceptions Count="2">
<Item1>
<Name Value="ECodetoolError"/>
</Item1>
<Item2>
<Name Value="EFOpenError"/>
</Item2>
</Exceptions>
</Debugging>
</CONFIG>

View file

@ -1,44 +0,0 @@
// +----------------------------------------------------------------------+
// | chsdet - Charset Detector Library |
// +----------------------------------------------------------------------+
// | Copyright (C) 2006, Nick Yakowlew http://chsdet.sourceforge.net |
// +----------------------------------------------------------------------+
// | Based on Mozilla sources http://www.mozilla.org/projects/intl/ |
// +----------------------------------------------------------------------+
// | This library is free software; you can redistribute it and/or modify |
// | it under the terms of the GNU General Public License as published by |
// | the Free Software Foundation; either version 2 of the License, or |
// | (at your option) any later version. |
// | This library is distributed in the hope that it will be useful |
// | but WITHOUT ANY WARRANTY; without even the implied warranty of |
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
// | See the GNU Lesser General Public License for more details. |
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: chsdet.lpr,v 1.1 2009/03/20 17:40:22 ya_nick Exp $
library chsdet;

// Lazarus/Free Pascal project file for the chsdet dynamic library.
// It contains no logic of its own: it only publishes the flat stdcall API
// implemented in chsdIntf.pas through the exports clause below.

uses
  chsdIntf in 'chsdIntf.pas',
  LangBulgarianModel in 'sbseq/LangBulgarianModel.pas',
  LangCyrillicModel in 'sbseq/LangCyrillicModel.pas',
  LangGreekModel in 'sbseq/LangGreekModel.pas',
  LangHebrewModel in 'sbseq/LangHebrewModel.pas';

exports
  csd_Reset,
  csd_HandleData,
  csd_Done,
  csd_DataEnd,
  csd_GetDetectedCharset,
  csd_GetKnownCharsets,
  csd_GetAbout,
  // These two routines are declared (stdcall) in chsdIntf's interface section
  // like the rest of the API; they must be listed here to be reachable by
  // clients of the compiled library.
  csd_GetDetectedBOM,
  csd_DisableCharsetCP;

// Resource directives are intentionally left disabled (the leading '.' turns
// them into plain comments), exactly as in the original project file.
{.chsdet$R *.res}
{.chsdet$R chsdet.res}

begin
end.

View file

@ -16,7 +16,7 @@
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: nsCodingStateMachine.pas,v 1.3 2007/05/26 13:09:38 ya_nick Exp $
// $Id: nsCodingStateMachine.pas,v 1.4 2013/04/23 19:47:10 ya_nick Exp $
unit nsCodingStateMachine;
@ -38,9 +38,9 @@ type
type
SMModel = record
classTable: Pointer; //nsPkgInt;
classFactor: PRUint32;
classFactor: uInt32;
stateTable: Pointer; //nsPkgInt;
charLenTable: Pointer; // pByteArray; // array of byte; // pPRUint32;
charLenTable: Pointer; // aByteArray; // array of byte; // puInt32;
CharsetID: eInternalCharsetID;
end;
pSMModel = ^SMModel;
@ -49,16 +49,16 @@ type
TnsCodingStateMachine = class (TObject)
protected
mCurrentState: nsSMState;
mCurrentCharLen: PRUint32;
mCurrentBytePos: PRUint32;
mCurrentCharLen: uInt32;
mCurrentBytePos: uInt32;
mModel: SMModel;
public
Enabled: Boolean;
constructor Create(sm: SMModel);
destructor Destroy; override;
function NextState(c: char): nsSMState;
function GetCurrentCharLen: PRUint32;
function NextState(c: AnsiChar): nsSMState;
function GetCurrentCharLen: uInt32;
procedure Reset;
function GetCharsetID: eInternalCharsetID;
@ -85,9 +85,9 @@ begin
inherited;
end;
function TnsCodingStateMachine.NextState(c: char): nsSMState;
function TnsCodingStateMachine.NextState(c: AnsiChar): nsSMState;
var
byteCls: PRUint32;
byteCls: uInt32;
begin
if not Enabled then
begin
@ -95,14 +95,14 @@ begin
exit;
end;
(*for each byte we get its class , if it is first byte, we also get byte length*)
byteCls := pByteArray(mModel.classTable)[integer(c)];
byteCls := aByteArray(mModel.classTable)[integer(c)];
if mCurrentState = eStart then
begin
mCurrentBytePos := 0;
mCurrentCharLen := pByteArray(mModel.charLenTable)[byteCls];
mCurrentCharLen := aByteArray(mModel.charLenTable)[byteCls];
end;
(*from byte's class and stateTable, we get its next state*)
mCurrentState := nsSMState(pByteArray(mModel.stateTable)[cardinal(mCurrentState) * mModel.classFactor + byteCls]);
mCurrentState := nsSMState(aByteArray(mModel.stateTable)[cardinal(mCurrentState) * mModel.classFactor + byteCls]);
inc(mCurrentBytePos);
//if mCurrentBytePos > mCurrentCharLen then
@ -111,7 +111,7 @@ begin
Result:= mCurrentState;
end;
function TnsCodingStateMachine.GetCurrentCharLen: PRUint32;
function TnsCodingStateMachine.GetCurrentCharLen: uInt32;
begin
Result:= mCurrentCharLen;
end;
@ -126,4 +126,4 @@ begin
Result:= mModel.CharsetID;
end;
end.
end.

View file

@ -16,24 +16,23 @@
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: nsCore.pas,v 1.4 2008/06/22 09:04:20 ya_nick Exp $
// $Id: nsCore.pas,v 1.5 2013/04/23 19:47:10 ya_nick Exp $
unit nsCore;
interface
type
PRInt16 = smallint;
PRUint16 = word;
PRInt32 = integer;
PRUint32 = cardinal;
int16 = smallint;
int32 = integer;
uInt32 = cardinal;
pByteArray = array of Byte;
pPRUint32 = ^PRUint32;
aPRUint32 = array of PRUint32;
aByteArray = array of Byte;
puInt32 = ^uInt32;
auInt32 = array of uInt32;
pPRint16 = ^PRint16;
aPRint16 = array of PRint16;
pInt16 = ^int16;
aInt16 = array of int16;
const
SURE_YES: double = 0.99;
@ -50,7 +49,7 @@ type
);
type
nsResult = PRUint32;
nsResult = uInt32;
const
NS_OK = 0;
NS_ERROR_OUT_OF_MEMORY = $8007000e;
@ -76,25 +75,28 @@ type
BOM_UTF8 // EF BB BF UTF-8
);
rBOMDef = record
Length: integer;
BOM: array [0..3] of AnsiChar;
end;
const
KnownBOM: array [eBOMKind] of array [0..4] of Char = (
// first element = byte count
(#$00, #$00, #$00, #$00, #$00),
(#$04, #$00, #$00, #$FE, #$FF),
(#$04, #$FF, #$FE, #$00, #$00),
(#$04, #$00, #$00, #$FF, #$FE),
(#$04, #$FE, #$FF, #$00, #$00),
(#$02, #$FE, #$FF, #$00, #$00),
(#$02, #$FF, #$FE, #$00, #$00),
(#$03, #$EF, #$BB, #$BF, #$00)
KNOWN_BOM: array [eBOMKind] of rBOMDef = (
(Length: 00; BOM: (#$00, #$00, #$00, #$00)),
(Length: 04; BOM: (#$00, #$00, #$FE, #$FF)),
(Length: 04; BOM: (#$FF, #$FE, #$00, #$00)),
(Length: 04; BOM: (#$00, #$00, #$FF, #$FE)),
(Length: 04; BOM: (#$FE, #$FF, #$00, #$00)),
(Length: 02; BOM: (#$FE, #$FF, #$00, #$00)),
(Length: 02; BOM: (#$FF, #$FE, #$00, #$00)),
(Length: 03; BOM: (#$EF, #$BB, #$BF, #$00))
);
// "extended" charset info
type
rCharsetInfo = record
Name: pChar;
CodePage: integer;
Language: pChar;
rCharsetInfo = record
Name: PAnsiChar;
CodePage: Integer;
Language: PAnsiChar;
end;
eInternalCharsetID = (
@ -333,16 +335,16 @@ const
(* both functions Allocate a new buffer for newBuf. This buffer should be *)
(* freed by the caller using PR_FREEIF.*)
(* Both functions return PR_FALSE in case of memory allocation failure.*)
function FilterWithoutEnglishLetters(aBuf: PChar; aLen: integer; var newBuf: PChar; var newLen: integer): Boolean;
function FilterWithEnglishLetters(aBuf: PChar; aLen: integer; var newBuf: PChar; var newLen: integer): Boolean;
function FilterWithoutEnglishLetters(aBuf: pAnsiChar; aLen: integer; var newBuf: pAnsiChar; var newLen: integer): Boolean;
function FilterWithEnglishLetters(aBuf: pAnsiChar; aLen: integer; var newBuf: pAnsiChar; var newLen: integer): Boolean;
implementation
function FilterWithEnglishLetters(aBuf: PChar;
aLen: integer; var newBuf: PChar; var newLen: integer): Boolean;
function FilterWithEnglishLetters(aBuf: pAnsiChar;
aLen: integer; var newBuf: pAnsiChar; var newLen: integer): Boolean;
var
newptr: pChar;
prevPtr: pChar;
curPtr: pChar;
newptr: pAnsiChar;
prevPtr: pAnsiChar;
curPtr: pAnsiChar;
isInTag: Boolean;
begin
//do filtering to reduce load to probers
@ -403,12 +405,12 @@ begin
Result := TRUE;
end;
function FilterWithoutEnglishLetters(aBuf: PChar;
aLen: integer; var newBuf: PChar; var newLen: integer): Boolean;
function FilterWithoutEnglishLetters(aBuf: pAnsiChar;
aLen: integer; var newBuf: pAnsiChar; var newLen: integer): Boolean;
var
newPtr: pChar;
prevPtr: pChar;
curPtr: pChar;
newPtr: pAnsiChar;
prevPtr: pAnsiChar;
curPtr: pAnsiChar;
meetMSB: Boolean;
begin
(*This filter applies to all scripts which do not use English characters*)
@ -464,8 +466,3 @@ begin
end;
end.

View file

@ -16,7 +16,7 @@
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: nsEscCharsetProber.pas,v 1.3 2007/05/26 13:09:38 ya_nick Exp $
// $Id: nsEscCharsetProber.pas,v 1.4 2013/04/23 19:47:10 ya_nick Exp $
unit nsEscCharsetProber;

View file

@ -16,7 +16,7 @@
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: nsMBCSMultiProber.pas,v 1.2 2007/05/26 13:09:38 ya_nick Exp $
// $Id: nsMBCSMultiProber.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
unit nsMBCSMultiProber;
@ -36,12 +36,12 @@ type
mContextAnalysis: array of TJapaneseContextAnalysis;
mBestGuess: integer;
function RunStatAnalyse(aBuf: PChar; aLen: integer): eProbingState;
function RunStatAnalyse(aBuf: pAnsiChar; aLen: integer): eProbingState;
function GetConfidenceFor(index: integer): double; reintroduce;
public
constructor Create; reintroduce;
destructor Destroy; override;
function HandleData(aBuf: PChar; aLen: integer): eProbingState; override;
function HandleData(aBuf: pAnsiChar; aLen: integer): eProbingState; override;
function GetConfidence: double; override;
procedure Reset; override;
{$ifdef DEBUG_chardet}
@ -141,11 +141,11 @@ begin
end;
{$endif}
function TnsMBCSMultiProber.HandleData(aBuf: PChar; aLen: integer): eProbingState;
function TnsMBCSMultiProber.HandleData(aBuf: pAnsiChar; aLen: integer): eProbingState;
var
i: integer; (*do filtering to reduce load to probers*)
highbyteBuf: PChar;
hptr: PChar;
highbyteBuf: pAnsiChar;
hptr: pAnsiChar;
keepNext: Boolean;
begin
keepNext := TRUE;
@ -197,12 +197,12 @@ begin
Result := mState;
end;
function TnsMBCSMultiProber.RunStatAnalyse(aBuf: PChar; aLen: integer): eProbingState;
function TnsMBCSMultiProber.RunStatAnalyse(aBuf: pAnsiChar; aLen: integer): eProbingState;
var
i, c: integer;
codingState: nsSMState;
charLen: byte;
mLastChar: array [0..1] of Char;
mLastChar: array [0..1] of AnsiChar;
begin
{$IFDEF DEBUG_chardet}
AddDump('MultiByte - Stat Analyse - start');
@ -313,4 +313,4 @@ begin
end;
end;
end.
end.

View file

@ -16,7 +16,7 @@
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: nsPkg.pas,v 1.2 2007/05/20 15:46:11 ya_nick Exp $
// $Id: nsPkg.pas,v 1.3 2013/04/23 19:47:10 ya_nick Exp $
unit nsPkg;
@ -59,7 +59,7 @@ type
sftmsk: nsSftMsk;
bitsft: nsBitSft;
unitmsk: nsUnitMsk;
data: pPRUint32;
data: puInt32;
end;
pnsPkgInt = ^nsPkgInt;
@ -88,7 +88,7 @@ end;
function GETFROMPCK(i: integer; c: pnsPkgInt): integer;
begin
Result:= (((aPRUint32(c^.data)[i shr integer(c^.idxsft)]) shr (i and integer(c^.sftmsk) shl integer(c^.bitsft))) and integer(c^.unitmsk));
Result:= (((auInt32(c^.data)[i shr integer(c^.idxsft)]) shr (i and integer(c^.sftmsk) shl integer(c^.bitsft))) and integer(c^.unitmsk));
end;
end.
end.

View file

@ -16,91 +16,90 @@
// | http://www.opensource.org/licenses/lgpl-license.php |
// +----------------------------------------------------------------------+
//
// $Id: nsUniversalDetector.pas,v 1.5 2008/06/22 09:04:20 ya_nick Exp $
// $Id: nsUniversalDetector.pas,v 1.7 2013/05/16 15:41:14 ya_nick Exp $
unit nsUniversalDetector;
interface
uses
{$I dbg.inc}
nsCore,
{$I dbg.inc}
nsCore,
CustomDetector;
const
NUM_OF_CHARSET_PROBERS = 4;
NUM_OF_CHARSET_PROBERS = 4;
// Input-state enumeration and the top-level detector class.
//
// NOTE(review): this listing carries both revisions of the declarations side by
// side. The first form uses nsInputState (ePureAscii/eEscAscii/eHighbyte), Char
// and pChar/PChar; the second uses eInputState (isPureAscii/isEscAscii/
// isHighbyte), AnsiChar and pAnsiChar, and GetKnownCharset hands back a String
// instead of a pChar.
//
// Input state, as driven by Reset and HandleData:
//   pure ASCII  - initial state set by Reset
//   esc ASCII   - entered from pure ASCII on #$1B or on '{' preceded by '~'
//   high byte   - entered when HandleData meets a non-ASCII byte
type nsInputState = (
ePureAscii = 0,
eEscAscii = 1,
eHighbyte = 2
) ;
type
eInputState = (
isPureAscii = 0,
isEscAscii = 1,
isHighbyte = 2
);
// Fields (first revision):
//   mDone             - set TRUE by HandleData once a prober returns psFoundIt
//   mStart            - TRUE after Reset; cleared by the first HandleData call,
//                       which is also the only call that runs CheckBOM
//   mGotData          - set TRUE by HandleData when aLen > 0
//   mLastChar         - previous byte seen on the ASCII path, used to spot "~{"
//   mDetectedCharset  - current answer; UNKNOWN_CHARSET after Reset
//   mCharSetProbers   - the NUM_OF_CHARSET_PROBERS probers fed in high-byte state
//   mEscCharSetProber - the prober fed in esc-ASCII state
//   mDetectedBOM      - BOM kind found by CheckBOM; BOM_Not_Found after Reset
TnsUniversalDetector = class (TObject)
protected
mInputState: nsInputState;
mDone: Boolean;
mStart: Boolean;
mGotData: Boolean;
mLastChar: Char;
mDetectedCharset: eInternalCharsetID;
mCharSetProbers: array [0..Pred(NUM_OF_CHARSET_PROBERS)] of TCustomDetector;
mEscCharSetProber: TCustomDetector;
mDetectedBOM: eBOMKind;
// Fields (second revision): same set, with eInputState and AnsiChar.
TnsUniversalDetector = class(TObject)
protected
mInputState: eInputState;
mDone: Boolean;
mStart: Boolean;
mGotData: Boolean;
mLastChar: AnsiChar;
mDetectedCharset: eInternalCharsetID;
mCharSetProbers: array[0..Pred(NUM_OF_CHARSET_PROBERS)] of TCustomDetector;
mEscCharSetProber: TCustomDetector;
mDetectedBOM: eBOMKind;
// Protected helpers (first revision).
// Report stores a charset only while none has been stored yet.
// CheckBOM records the BOM kind in mDetectedBOM and returns the BOM length.
procedure Report(aCharsetID: eInternalCharsetID);
function CheckBOM(aBuf: pChar; aLen: integer): integer;
function GetCharsetID(CodePage: integer): eInternalCharsetID;
procedure DoEnableCharset(Charset: eInternalCharsetID; SetEnabledTo: Boolean);
public
constructor Create;
destructor Destroy; override;
// Protected helpers (second revision): CheckBOM now takes pAnsiChar.
procedure Report(aCharsetID: eInternalCharsetID);
function CheckBOM(aBuf: pAnsiChar; aLen: integer): integer;
function GetCharsetID(CodePage: integer): eInternalCharsetID;
procedure DoEnableCharset(Charset: eInternalCharsetID; SetEnabledTo: Boolean);
public
constructor Create;
destructor Destroy; override;
// Detection cycle: Reset, then HandleData for each buffer, then DataEnd.
// DataEnd settles mDetectedCharset from the probers when HandleData did not.
procedure Reset;
function HandleData(aBuf: PChar; aLen: integer): nsResult;
procedure DataEnd;
procedure Reset;
function HandleData(aBuf: pAnsiChar; aLen: integer): nsResult;
procedure DataEnd;
// Returns the KNOWN_CHARSETS entry for mDetectedCharset.
function GetDetectedCharsetInfo: nsCore.rCharsetInfo;
function GetDetectedCharsetInfo: nsCore.rCharsetInfo;
// GetKnownCharset fills a text listing of the known charsets and returns its length.
// GetAbout copies the AboutInfo constant into About.
function GetKnownCharset(out KnownCharsets: pChar): integer;
procedure GetAbout(out About: rAboutHolder);
procedure DisableCharset(CodePage: integer);
function GetKnownCharset(out KnownCharsets: String): integer;
procedure GetAbout(out About: rAboutHolder);
procedure DisableCharset(CodePage: integer);
// Read-only views of mDone and mDetectedBOM.
property Done: Boolean read mDone;
property BOMDetected: eBOMKind read mDetectedBOM;
end;
property Done: Boolean read mDone;
property BOMDetected: eBOMKind read mDetectedBOM;
end;
implementation
uses
SysUtils,
nsGroupProber,
nsMBCSMultiProber,
nsSBCSGroupProber,
nsEscCharsetProber,
nsLatin1Prober,
nsMBCSMultiProber,
nsSBCSGroupProber,
nsEscCharsetProber,
nsLatin1Prober,
MBUnicodeMultiProber;
// Implementation-level constants.
const
// DataEnd reports the best prober's charset only when its confidence is
// strictly greater than this threshold; otherwise nothing is reported.
MINIMUM_THRESHOLD: float = 0.20;
MINIMUM_THRESHOLD: float = 0.20;
// Version and copyright record handed out by GetAbout.
// NOTE(review): BuildVersionNr and About each appear twice because this listing
// shows both revisions (build 6 / 2008 and build 8 / 2013).
// NOTE(review): the component README states "Version 0.2.9 stable" while the
// newer value here is build 8 - confirm which number is intended.
AboutInfo: rAboutHolder = (
MajorVersionNr: 0;
MinorVersionNr: 2;
BuildVersionNr: 6;
About: 'Charset Detector Library. Copyright (C) 2006 - 2008, Nick Yakowlew. http://chsdet.sourceforge.net';
BuildVersionNr: 8;
About: 'Charset Detector Library. Copyright (C) 2006 - 2013, Nick Yakowlew. http://chsdet.sourceforge.net';
);
{ TnsUniversalDetector }
{ TnsUniversalDetector }
// Builds the detector: creates the four charset probers and the escape-sequence
// prober, then calls Reset to put every field into its initial state.
// NOTE(review): "inherited Create" and the mEscCharSetProber assignment each
// appear twice because this listing shows both revisions of those lines (they
// presumably differ only in whitespace); exactly one of each is intended.
constructor TnsUniversalDetector.Create;
begin
inherited Create;
inherited Create;
// Fixed slots 0..3 of mCharSetProbers; HandleData and DataEnd visit them in this order.
mCharSetProbers[0] := TnsMBCSMultiProber.Create;
mCharSetProbers[1] := TnsSBCSGroupProber.Create;
mCharSetProbers[2] := TnsLatin1Prober.Create;
mCharSetProbers[3] := TMBUnicodeMultiProber.Create;
mEscCharSetProber := TnsEscCharSetProber.Create;
mEscCharSetProber := TnsEscCharSetProber.Create;
// Reset also resets the probers, so it must run after they exist.
Reset;
end;
@ -108,7 +107,7 @@ destructor TnsUniversalDetector.Destroy;
var
i: integer;
begin
for i := 0 to Pred(NUM_OF_CHARSET_PROBERS) do
for i := 0 to Pred(NUM_OF_CHARSET_PROBERS) do
mCharSetProbers[i].Free;
mEscCharSetProber.Free;
@ -118,9 +117,9 @@ end;
procedure TnsUniversalDetector.DataEnd;
var
proberConfidence: float;
proberConfidence: float;
maxProberConfidence: float;
maxProber: PRInt32;
maxProber: int32;
i: integer;
begin
if not mGotData then
@ -135,7 +134,7 @@ begin
exit;
end;
case mInputState of
eHighbyte:
isHighbyte:
begin
maxProberConfidence := 0.0;
maxProber := 0;
@ -143,72 +142,49 @@ begin
begin
proberConfidence := mCharSetProbers[i].GetConfidence;
if proberConfidence > maxProberConfidence then
begin
maxProberConfidence := proberConfidence;
maxProber := i;
end;
begin
maxProberConfidence := proberConfidence;
maxProber := i;
end;
end;
(*do not report anything because we are not confident of it, that's in fact a negative answer*)
if maxProberConfidence > MINIMUM_THRESHOLD then
Report(mCharSetProbers[maxProber].GetDetectedCharset);
Report(mCharSetProbers[maxProber].GetDetectedCharset);
end;
eEscAscii:
begin
isEscAscii:
begin
mDetectedCharset := mEscCharSetProber.GetDetectedCharset;
end;
else
begin
mDetectedCharset := PURE_ASCII_CHARSET;
end;
end;{case}
{$ifdef DEBUG_chardet}
else
begin
mDetectedCharset := PURE_ASCII_CHARSET;
end;
end; {case}
{$IFDEF DEBUG_chardet}
AddDump('Universal detector - DataEnd');
{$endif}
{$ENDIF}
end;
function TnsUniversalDetector.HandleData(aBuf: PChar; aLen: integer): nsResult;
function TnsUniversalDetector.HandleData(aBuf: pAnsiChar; aLen: integer): nsResult;
var
i: integer;
st: eProbingState;
// startAt: integer;
//newBuf: pChar;
//BufPtr: pChar;
//b: integer;
//tmpBOM: eBOMKind;
begin
// startAt := 0;
if mDone then
begin
Result := NS_OK;
exit;
end;
if aLen > 0 then
mGotData := TRUE;
mGotData := TRUE;
(*If the data starts with BOM, it should be Unicode, but we continue check*)
(*If the data starts with BOM, we know it is Unicode*)
if mStart then
begin
mStart := FALSE;
// startAt := CheckBOM(aBuf, aLen);
CheckBOM(aBuf, aLen);
// case mDetectedBOM of
// BOM_UCS4_BE: mDetectedCharset := UCS4_BE_CHARSET;
// BOM_UCS4_LE: mDetectedCharset := UCS4_LE_CHARSET;
// BOM_UTF16_BE: mDetectedCharset := UTF16_BE_CHARSET;
// BOM_UTF16_LE: mDetectedCharset := UTF16_LE_CHARSET;
// BOM_UTF8: mDetectedCharset := UTF8_CHARSET;
//
// BOM_UCS4_2143: mDetectedCharset := UCS4_BE_CHARSET;
// BOM_UCS4_3412: mDetectedCharset := UCS4_LE_CHARSET;
// end;
// TODO - some stupid ASCII text can start with a BOM. What to do?
if mDetectedCharset <> UNKNOWN_CHARSET then
begin
// mDone := TRUE;
// Result := NS_OK;
// exit;
end;
end; {if mStart}
end; {if mStart}
for i := 0 to Pred(aLen) do
(*other than 0xa0, if every other character is ascii, the page is ascii*)
@ -216,100 +192,70 @@ begin
begin
(*Since many Ascii only page contains NBSP *)
(*we got a non-ascii byte (high-byte)*)
if mInputState <> eHighbyte then
if mInputState <> isHighbyte then
begin
(*adjust state*)
mInputState := eHighbyte;
mInputState := isHighbyte;
end;
end
else
begin
(*ok, just pure ascii so *)
if (mInputState = ePureAscii) and
((aBuf[i] = #$1B) or
(aBuf[i] = '{') and
(mLastChar = '~')) then
if (mInputState = isPureAscii) and
((aBuf[i] = #$1B) or
(aBuf[i] = '{') and
(mLastChar = '~')) then
(*found escape character or HZ "~{"*)
mInputState := eEscAscii;
mInputState := isEscAscii;
mLastChar := aBuf[i];
end;
case mInputState of
eEscAscii:
isEscAscii:
begin
{$ifdef DEBUG_chardet}
{$IFDEF DEBUG_chardet}
AddDump('Universal detector - Escape Detector started');
{$endif}
st := mEscCharSetProber.HandleData(aBuf,aLen);
{$ENDIF}
st := mEscCharSetProber.HandleData(aBuf, aLen);
if st = psFoundIt then
begin
mDone := TRUE;
mDetectedCharset := mEscCharSetProber.GetDetectedCharset;
end;
end;
eHighbyte:
isHighbyte:
begin
{$ifdef DEBUG_chardet}
{$IFDEF DEBUG_chardet}
AddDump('Universal detector - HighByte Detector started');
{$endif}
{$ENDIF}
for i := 0 to Pred(NUM_OF_CHARSET_PROBERS) do
begin
//newBuf := AllocMem(aLen+StartAt);
//BufPtr := newBuf;
//try
//tmpBOM := BOM_Not_Found;
//if mDetectedBOM = BOM_Not_Found then
//begin
////case mCharSetProbers[i].GetDetectedCharset of
//// UTF16_BE_CHARSET: tmpBOM := BOM_UCS4_BE;
//// UTF16_LE_CHARSET: tmpBOM := BOM_UCS4_LE;
//// else
//// tmpBOM := BOM_Not_Found;
////end;
//tmpBOM := BOM_UTF16_BE;
//end;
//for b:=0 to integer(KnownBOM[tmpBOM][0])-1 do
//begin
//BufPtr^ := KnownBOM[tmpBOM][b+1];
//inc(BufPtr);
//end;
//
//for b:=0 to aLen do
//begin
//BufPtr^ := aBuf[b];
//inc(BufPtr);
//end;
st := mCharSetProbers[i].HandleData(aBuf,aLen);
// st := mCharSetProbers[i].HandleData(newBuf,aLen+startAt);
if st = psFoundIt then
begin
mDone:= TRUE;
mDetectedCharset := mCharSetProbers[i].GetDetectedCharset;
// Result := NS_OK;
break;
end;
//finally
//FreeMem(newBuf, aLen);
//end;
end;
st := mCharSetProbers[i].HandleData(aBuf, aLen);
if st = psFoundIt then
begin
mDone := TRUE;
mDetectedCharset := mCharSetProbers[i].GetDetectedCharset;
break;
end;
end;
end;
else
else
(*pure ascii*)
begin
(*do nothing here*)
end;
end;{case}
end; {case}
Result := NS_OK;
end;
// Records aCharsetID as the detected charset, but only when it is a real charset
// (not UNKNOWN_CHARSET) and no charset has been recorded yet, so the first
// report wins.
// NOTE(review): the condition and the assignment each appear twice because this
// listing shows both revisions of those lines. Read literally, the second
// assignment would fall outside the guard - confirm that only one copy of each
// is meant to exist.
procedure TnsUniversalDetector.Report(aCharsetID: eInternalCharsetID);
begin
if (aCharsetID <> UNKNOWN_CHARSET) and
(mDetectedCharset = UNKNOWN_CHARSET) then
if (aCharsetID <> UNKNOWN_CHARSET) and
(mDetectedCharset = UNKNOWN_CHARSET) then
mDetectedCharset := aCharsetID;
mDetectedCharset := aCharsetID;
end;
procedure TnsUniversalDetector.Reset;
@ -320,11 +266,11 @@ begin
mStart := TRUE;
mDetectedCharset := UNKNOWN_CHARSET;
mGotData := FALSE;
mInputState := ePureAscii;
mLastChar := #0; (*illegal value as signal*)
mInputState := isPureAscii;
mLastChar := #0; (*illegal value as signal*)
mEscCharSetProber.Reset;
for i := 0 to Pred(NUM_OF_CHARSET_PROBERS) do
mCharSetProbers[i].Reset;
mCharSetProbers[i].Reset;
mDetectedBOM := BOM_Not_Found;
end;
@ -333,18 +279,16 @@ begin
Result := KNOWN_CHARSETS[mDetectedCharset];
end;
// Produces a text listing of every entry in KNOWN_CHARSETS.
// Each entry is written as #10 + Name + ' - ' + CodePage, so the text starts
// with a line feed.
//   KnownCharsets - receives the listing
// Returns the length of the listing.
// NOTE(review): this listing carries both revisions of the routine. The first
// builds the text in a local string s and returns pChar(s); the second builds
// it directly in the String out-parameter.
function TnsUniversalDetector.GetKnownCharset(out KnownCharsets: pChar): integer;
function TnsUniversalDetector.GetKnownCharset(out KnownCharsets: String): integer;
var
s: ANSIstring;
i: integer;
i: eInternalCharsetID;
begin
// First revision: accumulate into s, iterating by ordinal and casting back.
s := '';
for i := integer(low(KNOWN_CHARSETS)) to integer(High(KNOWN_CHARSETS)) do
s := s + #10 + KNOWN_CHARSETS[eInternalCharsetID(i)].Name +
' - ' + inttostr(KNOWN_CHARSETS[eInternalCharsetID(i)].CodePage);
// Second revision: accumulate into the out-parameter, iterating over the enum itself.
KnownCharsets := '';
for i := low(KNOWN_CHARSETS) to high(KNOWN_CHARSETS) do
KnownCharsets := KnownCharsets + #10 + KNOWN_CHARSETS[i].Name +
' - ' + IntToStr(KNOWN_CHARSETS[i].CodePage);
// NOTE(review): pChar(s) points into the local string s, which goes out of
// scope when the function returns - presumably the reason the String form
// replaced it; confirm no caller still relies on the pChar form.
KnownCharsets := pChar(s);
Result := Length(s);
Result := Length(KnownCharsets);
end;
procedure TnsUniversalDetector.GetAbout(out About: rAboutHolder);
@ -352,30 +296,28 @@ begin
About := AboutInfo;
end;
function TnsUniversalDetector.CheckBOM(aBuf: pChar; aLen: integer): integer;
function BOMLength(BOM: eBOMKind): integer;
begin
Result := integer(KnownBOM[BOM, 0]);
end;
function TnsUniversalDetector.CheckBOM(aBuf: pAnsiChar; aLen: integer): integer;
var
i, b: integer;
Same: Boolean;
bom: eBOMKind;
i: integer;
same: Boolean;
begin
Result := 0;
for i := integer(low(KnownBOM))+1 to integer(high(KnownBOM)) do
if aLen > BOMLength(eBOMKind(i)) then
mDetectedBOM := BOM_Not_Found;
for bom := Succ(low(eBOMKind)) to high(eBomKind) do
if aLen > KNOWN_BOM[bom].Length then
begin
Same := true;
for b := 0 to BOMLength(eBOMKind(i)) - 1 do
if (aBuf[b] <> KnownBOM[eBOMKind(i), b+1]) then
same := true;
for i := 0 to KNOWN_BOM[bom].Length - 1 do
if (aBuf[i] <> KNOWN_BOM[bom].BOM[i]) then
begin
Same := false;
same := false;
break;
end;
if Same then
if same then
begin
mDetectedBOM := eBOMKind(i);
Result := BOMLength(mDetectedBOM);
mDetectedBOM := bom;
Result := KNOWN_BOM[bom].Length;
exit;
end;
end;
@ -390,7 +332,7 @@ function TnsUniversalDetector.GetCharsetID(CodePage: integer): eInternalCharsetI
var
i: integer;
begin
for i := integer(low(KNOWN_CHARSETS))+1 to integer(high(KNOWN_CHARSETS)) do
for i := integer(low(KNOWN_CHARSETS)) + 1 to integer(high(KNOWN_CHARSETS)) do
if (KNOWN_CHARSETS[eInternalCharsetID(i)].CodePage = CodePage) then
begin
Result := eInternalCharsetID(i);
@ -423,11 +365,7 @@ begin
end;
end;
end;
end;
end.