Detect-It-Easy/db/read
Kaens 700aa0aa0a
db/read: *decEncoding *Chars0to??
db/read:
 - renamed Chars0to32* to Chars0to1F* for reasons obvious in hindsight
2024-04-28 13:04:58 +02:00

422 lines
19 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Supplemental read functions.
// Authors: unknown guy, Kaens (TG @kaens)
// Lots of legacy,
// TODO update the old scripts to use the new functions,
// and get rid of the functions themselves
// Endianness selectors for read_int16 and the wider read_* calls.
// Little-endian = reversed notation (Intel, ZX Spectrum);
// big-endian = direct notation (TCP/IP, Motorola, Amiga).
// For the BitReader object, BE is MSB and LE is LSB (intuitively).
const _BE = true;
const _LE = false;
// Use for the size parameter in findSignature.
const TOEOF = -1;
// ---------- START OF PRE-v3.06 CODE --------------------
/**
 * Legacy helper: fetch an unsigned 16-bit value stored in big-endian order.
 * Thin wrapper that forwards to File.read_uint16 with the _BE selector.
 * @param {UInt} nOffset - The offset in the file.
 * @returns {UShort} The word value.
 * @alias Binary.readBEWord
 */
File.readBEWord = function(nOffset) {
    var wordValue = File.read_uint16(nOffset, _BE);
    return wordValue;
};
/**
 * Legacy helper: fetch an unsigned 32-bit value stored in big-endian order.
 * Thin wrapper that forwards to File.read_uint32 with the _BE selector.
 * @param {UInt} nOffset - The offset in the file.
 * @returns {UInt} The dword value.
 * @alias Binary.readBEDword
 */
File.readBEDword = function(nOffset) {
    var dwordValue = File.read_uint32(nOffset, _BE);
    return dwordValue;
};
/**
 * Legacy helper: fetch an unsigned 16-bit value with caller-selected byte order.
 * Thin wrapper that forwards both arguments to File.read_uint16.
 * @param {UInt} nOffset - The offset in the file.
 * @param {Bool} bBE - True for big-endian.
 * @returns {UShort} The word value.
 * @alias Binary.readEWord
 */
File.readEWord = function(nOffset, bBE) {
    var wordValue = File.read_uint16(nOffset, bBE);
    return wordValue;
};
/**
 * Read a dword (unsigned 32-bit value), selecting endianness.
 * Forwards to File.read_uint32; the previous implementation called
 * File.read_uint16 and therefore only returned 16 bits.
 * @param {UInt} nOffset - The offset in the file.
 * @param {Bool} bBE - True for big-endian.
 * @returns {UInt} The dword value.
 * @alias Binary.readEDWord
 */
File.readEDword = function(nOffset, bBE) {
    return File.read_uint32(nOffset, bBE);
};
/**
 * Legacy helper: fetch a signed 16-bit value stored in little-endian order.
 * Thin wrapper that forwards to File.read_int16 with the _LE selector.
 * @param {UInt} nOffset - The offset in the file.
 * @returns {Short} The short value.
 * @alias Binary.readShort
 */
File.readShort = function(nOffset) {
    var shortValue = File.read_int16(nOffset, _LE);
    return shortValue;
};
// -------- END OF PRE-v3.06 CODE
// The encoding tables below start at byte 0x7F, not 0x80: index 0 of each table is the character for 0x7F.
const CP437 = "⌂"+
"ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒ"+
"áíóúñѪº¿⌐¬½¼¡«»░▒▓│┤╡╢╖╕╣║╗╝╜╛┐"+
"└┴┬├─┼╞╟╚╔╩╦╠═╬╧╨╤╥╙╘╒╓╫╪┘┌█▄▌▐▀"+
"αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²■ ";
const CP1252 = "⌂"+
"€?‚ƒ„…†‡ˆ‰Š‹Œ?Ž??‘’“”•–—˜™š›œ?žŸ"+
" ¡¢£¤¥¦§¨©ª«¬?®¯°±²³´µ¶·¸¹º»¼½¾¿"+
"ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞß"+
"àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";
const CPAmiga = "⫽"+ // alternatively, "▒""
"абвгдежзийклмнопрстуфхцчшщъыьэюя"+ //0x80~0xA0 display Cyrillics, just to fill the void
" ¡¢£¤¥¦§¨©ª«¬–®¯°±²³´µ¶·¸¹º»¼½¾¿"+
"ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞß"+
"àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";
const CPRISCOS = "⌂"+
"€Ŵŵ◰﯀Ŷŷ<C5B6>⇦⇨⇩⇧…™‰•“”„Œœ†‡fifl"+
" ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿"+
"ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞß"+
"àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";
const CPAtariST = "⌂"+
"ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥ßƒ"+
"áíóúñѪº¿⌐¬½¼¡«»ãõØøœŒÀÃÕ¨´†¶©®™"+
"ijIJאבגדהוזחטיכלמנסעפצקרשתןךםףץ§∧∞"+
"αβΓπΣσµτΦΘΩδ∮φ∈∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²³¯";
const JISX0201 = "⌂"+
"→-‚ƒ„…†‡ˆ‰Š‹Œ↑޳™‘’“”•–—˜™š›œ¢žŸ"+ //decided to mix it with cp1252
"→。「」、・ヲァィゥェォャュョッーアイウエカキクケコサシスセソタ"+
"チツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン゙゚"+
"àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";
const Chars0to1F = "・☺☻♥♦♣♠•◘○◙♂♀♪♫☼►◄↕‼¶§▬↨↑↓→←∟↔▲▼"; //#0 is a small dot from Japanese
const Chars0to1FLF = "・☺☻♥♦♣♠•◘○\x0A♂♀♪♫☼►◄↕‼¶§▬↨↑↓→←∟↔▲▼";
const Chars0to1FCRLF = "・☺☻♥♦♣♠•◘○\x0A♂♀\x0D♫☼►◄↕‼¶§▬↨↑↓→←∟↔▲▼";
/**
 * Turn an array of single-byte character codes into a JS string.
 *
 * Bytes 0x80 and above are looked up in `dectbl` at index (byte - 0x7F); byte 0x7F
 * normally maps to dectbl[0]. Bytes 0x00..0x1F are looked up in `tbl01F`, all other
 * 7-bit bytes are emitted through String.fromCharCode.
 *
 * When `dectbl` equals the literal name "JISX0201" or "KOI8-R", 0x0E / 0x0F switch an
 * internal "8th bit" on / off that is added to subsequent 7-bit bytes; for "JISX0201",
 * 0x5C and 0x7E additionally become the yen sign and the overline.
 * NOTE(review): these comparisons are against the *names*, not against a table constant;
 * confirm with the callers whether that is the intended way to select the 7-bit variants.
 *
 * @param {[uint8]} ansi - an array returned by readBytes.
 * @param {String[0x81]} dectbl - a decoding table (index 0 = byte 0x7F); just make a const here in db/read for that
 * @param {bool} zstop (optional, default=true) - whether to stop reading on 0 (ASCIIZ behaviour)
 * @param {String|Array} tbl01F (optional, default=Chars0to1FCRLF) - which table to use for the first 32 characters
 * @returns {String} a string value usable with js.
 */
function decEncoding(ansi, dectbl, zstop, tbl01F) {
    if (typeof zstop === 'undefined') zstop = true;
    if (typeof tbl01F === 'undefined') tbl01F = Chars0to1FCRLF;

    var isJis = dectbl == "JISX0201";
    var hasShift = isJis || dectbl == "KOI8-R"; // 0x0E/0x0F act as shift-out/shift-in
    var out = "";
    var highBit = 0; // 0x80 while shifted out, otherwise 0

    for (var pos = 0; pos < ansi.length; pos++) {
        var ch = ansi[pos];
        if (!ch && zstop) break;

        if (!(ch < 0x80)) {
            // 8-bit range: straight table lookup
            out += dectbl[ch - 0x7F];
            continue;
        }

        // 7-bit variation processing
        if (ch === 0x0E) {
            if (hasShift) highBit = 0x80;
            else out += tbl01F[0xE];
        } else if (ch === 0x0F) {
            if (hasShift) highBit = 0;
            else out += tbl01F[0xF];
        } else if (ch === 0x5C) {
            out += isJis ? "¥" : "\\";
        } else if (ch === 0x7E) {
            out += isJis ? "‾" : "~";
        } else if (ch === 0x7F) {
            if (isJis) out += String.fromCharCode(highBit + ch);
            else out += dectbl[0];
        } else if (!highBit && ch >= 0 && ch < 0x20) {
            out += tbl01F[ch];
        } else {
            out += String.fromCharCode(highBit + ch);
        }
    }
    return out;
}
/**
 * Collect a slice of the file as an array of byte values.
 * Reading stops at the end of the file, so the result can be shorter than `len`.
 * @param {UInt} ofs - the offset to start from.
 * @param {Int} len - the amount of bytes to read.
 * @param {Bool} zspace (optional, default=false) - replace 0 with 0x20 (space characters).
 * @returns {[uint8]} The file slice.
 */
function readBytes(ofs, len, zspace) { //for now; feels like this should be a system function
    if (typeof zspace === 'undefined') zspace = false;
    var bytes = [];
    var idx = 0;
    while (idx < len && ofs + idx < File.getSize()) {
        var value = File.read_uint8(ofs + idx);
        bytes.push(zspace && !value ? 0x20 : value);
        idx++;
    }
    return bytes;
}
/**
 * Read `len` bytes at `ofs` with readBytes and decode them with decEncoding.
 * The table for the first 32 characters is left at decEncoding's default.
 * @param {UInt} ofs - the offset to start from.
 * @param {UInt} len - the amount of bytes to read.
 * @param {String[0x81]} dectbl - a decoding table; just make a const here in db/read for that
 * @param {bool} zstop (optional, default=true) - whether to stop reading on 0 (ASCIIZ behaviour)
 * @returns {String} a string value usable with js.
 */
function decAnsi(ofs, len, dectbl, zstop) {
    var rawBytes = readBytes(ofs, len);
    return decEncoding(rawBytes, dectbl, zstop);
}
/**
 * Integer division that keeps only the whole part of the quotient.
 * The magnitude is floor(|a| / |b|) and the sign is taken from the sign of a*b,
 * so the result is rounded towards zero. Dividing by zero yields NaN.
 * Not tested on all weird cases, but __you know what you did.__
 * @param {Int} a - the dividend
 * @param {Int} b - the divisor
 * @returns {Int} something like div
 */
function div(a, b) {
    var product = a * b;
    var sign;
    if (product > 0) sign = 1;
    else if (product < 0) sign = -1;
    else sign = 0;
    var magnitude = Math.floor(Math.abs(a) / Math.abs(b));
    return sign * magnitude;
}
/**
 * Format a number as an upper-case hexadecimal string with an "h" suffix.
 * Negative numbers are rendered as "-" followed by the digits of the absolute value.
 * @param {Int} a - the numerical value; undefined yields the marker "!Hex(undefined)".
 * @param {UInt} padz (optional,default=2) - minimum digit count, filled with leading zeroes.
 * @returns {String} The hex value, capital letters A~F, ending with "h".
 */
function Hex(a, padz) {
    if (typeof a === 'undefined') return "!Hex(" + a + ")";
    if (typeof padz === 'undefined') padz = 2;
    var sign = "";
    if (a < 0) {
        sign = "-";
        a = -a;
    }
    var digits = a.toString(16).toUpperCase();
    var zeroes = digits.length < padz ? Array(1 + padz - digits.length).join('0') : "";
    return sign + zeroes + digits + "h";
}
/**
 * Format a number as a binary string with a "b" suffix.
 * Negative numbers are rendered as "-" followed by the digits of the absolute value.
 * @param {Int} a - the numerical value; undefined yields the marker "!Bin(undefined)".
 * @param {UInt} padz (optional,default=4) - minimum digit count, filled with leading zeroes.
 * @returns {String} The binary value, ending with "b".
 */
function Bin(a, padz) {
    if (typeof a === 'undefined') return "!Bin(" + a + ")";
    if (typeof padz === 'undefined') padz = 4;
    var sign = "";
    if (a < 0) {
        sign = "-";
        a = -a;
    }
    var digits = a.toString(2);
    var zeroes = digits.length < padz ? Array(1 + padz - digits.length).join('0') : "";
    return sign + zeroes + digits + "b";
}
/**
 * Format a number as an octal string with an "o" suffix.
 * Negative numbers are rendered as "-" followed by the digits of the absolute value.
 * @param {Int} a - the numerical value; undefined yields the marker "!Oct(undefined)".
 * @param {UInt} padz (optional,default=4) - minimum digit count, filled with leading zeroes.
 * @returns {String} The octal value, ending with "o".
 */
function Oct(a, padz) {
    if (typeof a === 'undefined') return "!Oct(" + a + ")";
    if (typeof padz === 'undefined') padz = 4;
    var sign = "";
    if (a < 0) {
        sign = "-";
        a = -a;
    }
    var digits = a.toString(8);
    var zeroes = digits.length < padz ? Array(1 + padz - digits.length).join('0') : "";
    return sign + zeroes + digits + "o";
}
/**
 * Read a variable-length quantity, an unsigned integer like in MIDI files, from the file.
 * Every byte contributes its low 7 bits (most significant group first); a set top bit
 * means another byte follows. At most 16 bytes are consumed and no byte is read at or
 * beyond the end of the file.
 *
 * The value is accumulated with multiplication rather than 32-bit shifts, so results
 * above 2^31 no longer wrap around or turn negative. Values above 2^53 are approximate
 * because they are held in a JS number.
 *
 * @param {UInt} ofs - the offset to start from.
 * @returns {List} [length,value] - if length (in physical bytes) = 0, the value had a problem:
 *   [0,0] the offset is outside the file;
 *   [0,-1] the sequence is not terminated within 16 bytes or before the end of file;
 *   [0,0xFFFFFFFFFFFFFFFF] the value does not fit into 64 bits.
 **/
function readVarUInt(ofs) {
    var size = File.getSize();
    if (ofs < 0 || ofs >= size) return [0, 0];

    var MAX_BYTES = 16;
    var LIMIT = 18446744073709551616; // 2^64: first value that needs more than 64 bits
    var value = 0, count = 0, pos = ofs, b;
    do {
        b = File.read_uint8(pos++);
        value = value * 128 + (b & 0x7F); // same as (value << 7) | payload, without the 32-bit wrap
        count++;
    } while ((b & 0x80) && count < MAX_BYTES && pos < size);

    if (b & 0x80) return [0, -1]; // still expecting a byte: ran into EOF or the 16-byte cap
    if (value >= LIMIT) return [0, 0xFFFFFFFFFFFFFFFF];
    return [count, value];
}
/**
 * This object facilitates reading a file as a sequence of bits.
 *
 * With endianness _LE (the default) bits are taken starting from the least significant
 * bit of each byte; with _BE they are taken starting from the most significant bit
 * (ogg/flac use _BE).
 *
 * The bit buffer is kept with plain arithmetic instead of 32-bit bitwise operators,
 * so reads of up to 32 bits return correct, non-negative values.
 * No end-of-file check is made: what is returned past the end of the file depends on
 * File.read_uint8.
 *
 * @param {UInt} [nOffset = 0] - the file offset (in bytes) to start reading from
 * @param {Bool} [nEndian = _LE] - bit order, _LE or _BE
 * @example
 * Create an instance at a file offset: var bits = new BitReader(10);
 * Read some bits (at most 32 per call): var value = bits.read(5);
 * Put the reader towards a different place, emptying the bit buffer: bits.init(10)
 * Receive the current file offset (in bytes): bits.offset
 * Set the file offset, in bytes, without changing state: bits.seek(10)
 * Set the position in bits from the start of the file: bits.bseek(14)
 * Skip some bytes without changing state: bits.consume(2)
 **/
function BitReader(nOffset, nEndian) {
    this.n = 0; // the number of bits in the buffer
    this.buf = 0; // the bit buffer
    this.offset = nOffset ? nOffset : 0; // the file offset of the next byte to fetch
    this.endian = nEndian ? nEndian : _LE; // for different mechanics of bitstreaming; ogg/flac use _BE

    // Move the reader to a new byte offset and drop any buffered bits.
    // (Used to write to a stray `ofs` property, which left the position unchanged.)
    this.init = function(nOffset) {
        this.offset = nOffset ? nOffset : 0;
        this.n = this.buf = 0;
    };

    // Read nBits bits from the file; requests above 32 are limited to 32,
    // negative requests return 0 without touching the state.
    this.read = function(nBits) {
        if (nBits > 32) nBits = 32;
        if (nBits < 0) return 0;
        var v;
        if (this.endian === _LE) {
            while (this.n < nBits) { // while the buffer is not enough
                // append the new byte above the bits already buffered
                this.buf += File.read_uint8(this.offset++) * Math.pow(2, this.n);
                this.n += 8;
            }
            var span = Math.pow(2, nBits);
            v = this.buf % span; // the lowest nBits bits
            this.buf = Math.floor(this.buf / span); // drop them from the buffer
        } else {
            while (this.n < nBits) {
                // shift the buffer to the left and append a byte
                this.buf = this.buf * 256 + File.read_uint8(this.offset++);
                this.n += 8;
            }
            var rest = Math.pow(2, this.n - nBits);
            v = Math.floor(this.buf / rest); // the highest nBits bits
            this.buf = this.buf % rest; // keep only what was not extracted
        }
        this.n -= nBits;
        return v; // return the value even if the file is exhausted
    };

    // Skip some bytes without changing state: bits.consume(2)
    this.consume = function(nBytes) { this.offset += nBytes; };

    // Set the bit-file's offset, in bytes, without changing state:
    this.seek = function(nOfs) { this.offset = nOfs; };

    // Set the bit-file's offset in bits: go to the byte containing that bit,
    // then discard the bits that precede it within the byte.
    this.bseek = function(nOfs) {
        var bitInByte = nOfs % 8;
        this.offset = (nOfs - bitInByte) / 8;
        this.buf = this.n = 0;
        this.read(bitInByte);
    };
}
/**
 * Check a file slice for being all zeroes.
 * The scan stops at the first non-zero byte, and the file size is queried only once.
 * @param {UInt} ofs - the offset to start from.
 * @param {Int} len - the amount of bytes to check.
 * @returns {bool} True if the slice is all zeroes (an empty slice counts as all zeroes).
 *   If the slice reaches beyond EoF, or len is negative, always false.
 */
function isAllZeroes(ofs, len) {
    var size = File.getSize();
    if (ofs + len > size) return false;
    var i;
    for (i = 0; i < len; i++) {
        if (File.read_uint8(ofs + i)) return false; // no need to look any further
    }
    // i equals len only when exactly len bytes were examined (false for a negative len)
    return i === len;
}
/**
 * If the string was too long and has been read incompletely, adds an ellipsis after the last
 * complete word, to avoid cut-off words. If fewer than two runs of `space characters' are
 * found between trim and mintrim, the ellipsis is simply appended to the first trim characters.
 * Mostly usable for lengthy multiline comments/messages.
 *
 * The backward scans are bounded at the start of the string; previously a string without
 * a separator before the scan position made the function loop forever.
 *
 * @param {String} a - the original incomplete string.
 * @param {Number} trim - the buffer size; if a.length >= trim, we decide it was cropped.
 * @param {Number} [mintrim = 78] - don't try searching for spaces below this point.
 * @returns {String} - the resulting string (a itself when it is shorter than trim or mintrim > trim).
 */
function addEllipsis(a, trim, mintrim) {
    if (!mintrim) mintrim = 78;
    if (a.length < trim || mintrim > trim) return a;
    // \x0A and \x0D (LF, CR) are included so that line breaks count as separators;
    // the older \x10 and \x13 entries are kept for compatibility.
    const spaces = " .,:;!\\/'\"=&\x09\x0A\x0D\x10\x13\x1A\x26。、。,,・";
    var i = trim, c = 0, ci = -1; // c: separator runs seen so far; ci: cut index after the last complete word
    while (i >= mintrim && c < 2) {
        if (spaces.indexOf(a[i]) >= 0) {
            c++;
            while (i >= 0 && spaces.indexOf(a[i]) >= 0) i--; // skip the whole run of separators
            if (ci < 0) ci = i + 1; // remember only the rightmost word end
        }
        while (i >= 0 && spaces.indexOf(a[i]) < 0) i--; // walk back over the word
    }
    if ((i < mintrim && c < 2) //we conclude this language doesn't really have that many spaces...
        || !c) //...or none at all in the trimmable slice...
        return a.slice(0, trim) + '…';
    //this language has some spaces and we can use the last one to trim
    return a.slice(0, Math.max(ci, mintrim)) + '…';
}
/**
 * Append a value to sOptions via sOptions.append, but only when its string form,
 * with surrounding whitespace trimmed, is not empty. The trimmed text is what gets appended.
 * @param {variant} a - the string to output (safe to accidentally drop a non-string in)
 * @param {String} prefix (optional) - what to put in front of the output string
 * @param {String} suffix (optional) - what to put after the output string
 */
function sOptionT(a, prefix, suffix) {
    if (typeof prefix === 'undefined') prefix = "";
    if (typeof suffix === 'undefined') suffix = "";
    var trimmed = ("" + a).trim();
    if (trimmed != "") {
        sOptions = sOptions.append(prefix + trimmed + suffix);
    }
}
/**
 * sOptions.append a string (optionally prefixed) if the string is not empty.
 * Unlike sOptionT, the text is neither trimmed before the emptiness check nor when
 * it is appended (the body used to be a verbatim copy of sOptionT and trimmed anyway).
 * @param {variant} a - the string to output (safe to accidentally drop a non-string in)
 * @param {String} prefix (optional) - what to put in front of the output string
 * @param {String} suffix (optional) - what to put after the output string
 */
function sOption(a, prefix, suffix) {
    if (typeof prefix === 'undefined') prefix = "";
    if (typeof suffix === 'undefined') suffix = "";
    var text = "" + a; // tolerate non-string input
    if (text != "") sOptions = sOptions.append(prefix + text + suffix);
}
/**
 * A more verbose (but still concise) way of outputting the calculated size(s, derived using different algorithms),
 * taking into account and visualising the difference from the actual file size.
 * @param {...Number} sizes - numerical values; negative values are skipped
 * @returns {String} the sizes joined with "/", or "?" when called without arguments
 *   or with an undefined first argument.
 * For example, if a file is 100 bytes long, outSz(90,100,105) will yield "90(+10)/100/105(-5!)"
 * The "!)" thus indicates the file is too short compared to the algorithmic estimation.
 * If some of the reported sizes match, the value will only be displayed once.
 * It's still a good idea to add "/malformed!short" to the version string — it's visible without isVerbose.
 */
function outSz() {
    if (!arguments.length || typeof arguments[0] === 'undefined') return "?";
    var fileSize = File.getSize();
    var shown = [], seen = [];
    // the counter is declared locally; it used to leak into (and clobber) a global `i`
    for (var i = 0; i < arguments.length; i++) {
        var sz = arguments[i];
        if (!(sz >= 0) || seen.indexOf(sz) >= 0) continue; // skip negatives and repeats
        seen.push(sz);
        if (sz < fileSize) shown.push(sz + "(+" + (fileSize - sz) + ")");
        else if (sz > fileSize) shown.push(sz + "(-" + (sz - fileSize) + "!)");
        else shown.push(sz);
    }
    return shown.join("/");
}
/**
 * Tell whether the suffix reported by Binary.getFileSuffix() equals the expected one,
 * ignoring letter case. Use as the option to isHeuristicScan being true.
 * @param {String} a - the expected file suffix, case-insensitive, no heading period unlike Python
 * @returns {bool} if a match is reached
 */
function extIs(a) {
    var actual = Binary.getFileSuffix().toLowerCase();
    var expected = a.toLowerCase();
    return actual == expected;
}
/**
 * slashTag formats two versions of one tag (for ex. in different languages) for display:
 * equal values are shown once, two different non-empty values are joined with "/",
 * a single non-empty value is shown alone, and two empty values give an empty string.
 * @param {String} a - the first of the two
 * @param {String} b - the second of the two
 * @returns {String}
 */
function slashTag(a, b) {
    if (a == b) return a;
    var aEmpty = a == "";
    var bEmpty = b == "";
    if (!aEmpty && bEmpty) return a;
    if (aEmpty && !bEmpty) return b;
    if (!aEmpty && !bEmpty) return a + "/" + b;
    return "";
}
/**
 * createOrderlyHuffmanTable is just for detections but it does return the table for further checks. Or it returns false.
 *
 * Codes are assigned in order of increasing length and, within one length, in order of
 * symbol index. The result is a flat tree: every node is [left, right, value], where left/right
 * are indices of child nodes (0 = none) and value is the symbol index stored in a leaf.
 * False is returned when all lengths are zero or when a code collides with an existing one.
 *
 * All working variables are local; the loop counters and the running code used to be
 * implicit globals.
 *
 * @param {Array} lent - the lengths table (code length per symbol, 0 = symbol unused)
 * @param {Number} btl - bit table length: how many entries of lent to process
 * @param {BitReader} br - a BitReader object; not used by the current implementation
 * @returns {Array or false}
 */
function createOrderlyHuffmanTable(lent, btl, br) {
    var md = 32, Md = 0, code = 0; // shortest / longest code length seen, next code to assign
    var _t = [];                   // the resulting node table
    var fi = [], li = [], ni = []; // per length: first and last symbol; per symbol: next symbol of the same length
    var i, d, len;

    for (i = 0; i < 33; i++) fi[i] = 0xFFFF; // 0xFFFF = no symbol of this length yet
    // chain the symbols of every code length, preserving symbol order
    for (i = 0; i < btl; i++) {
        len = lent[i];
        if (len) {
            if (len < md) md = len;
            if (len > Md) Md = len;
            if (fi[len] == 0xFFFF) { fi[len] = li[len] = i; }
            else { ni[li[len]] = i; li[len] = i; }
        }
    }
    if (!Md) return false;

    for (d = md; d <= Md; d++) {
        if (fi[d] != 0xFFFF) ni[li[d]] = btl; // terminate the chain of this length
        for (i = fi[d]; i < btl; i = ni[i]) {
            //insert HuffmanCode:
            var j = 0, le = _t.length;
            for (var cb = d; cb >= 0; cb--) {
                // current bit of the code, most significant first; 0 once all bits are consumed
                var cob = (cb && (((code >> (Md - d)) >> (cb - 1)) & 1)) ? 1 : 0;
                if (j != le) {
                    // walking existing nodes: reaching the end of the code here, or a leaf, is a collision
                    if (!cb || (!_t[j][0] && !_t[j][1])) return false; //[0] is left, [1] is right, [2] is value
                    if (!_t[j][cob]) { _t[j][cob] = le; j = le; }
                    else j = _t[j][cob];
                } else {
                    // extending the tree with a new node (a leaf when cb is 0)
                    _t.push([(cb && !cob) ? le + 1 : 0, (cb && cob) ? le + 1 : 0, cb ? 0 : i]);
                    j++; le++;
                }
            }
            code += 1 << (Md - d);
        }
    }
    return _t;
}