Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.
 
 
 

966 řádky
31 KiB

  1. const UnicodeTrie = require('./');
  2. const pako = require('pako');
  3. const { swap32LE } = require('./swap');
  4. // Shift size for getting the index-1 table offset.
  5. const SHIFT_1 = 6 + 5;
  6. // Shift size for getting the index-2 table offset.
  7. const SHIFT_2 = 5;
  8. // Difference between the two shift sizes,
  9. // for getting an index-1 offset from an index-2 offset. 6=11-5
  10. const SHIFT_1_2 = SHIFT_1 - SHIFT_2;
  11. // Number of index-1 entries for the BMP. 32=0x20
  12. // This part of the index-1 table is omitted from the serialized form.
  13. const OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> SHIFT_1;
  14. // Number of code points per index-1 table entry. 2048=0x800
  15. const CP_PER_INDEX_1_ENTRY = 1 << SHIFT_1;
  16. // Number of entries in an index-2 block. 64=0x40
  17. const INDEX_2_BLOCK_LENGTH = 1 << SHIFT_1_2;
  18. // Mask for getting the lower bits for the in-index-2-block offset. */
  19. const INDEX_2_MASK = INDEX_2_BLOCK_LENGTH - 1;
  20. // Number of entries in a data block. 32=0x20
  21. const DATA_BLOCK_LENGTH = 1 << SHIFT_2;
  22. // Mask for getting the lower bits for the in-data-block offset.
  23. const DATA_MASK = DATA_BLOCK_LENGTH - 1;
  24. // Shift size for shifting left the index array values.
  25. // Increases possible data size with 16-bit index values at the cost
  26. // of compactability.
  27. // This requires data blocks to be aligned by DATA_GRANULARITY.
  28. const INDEX_SHIFT = 2;
  29. // The alignment size of a data block. Also the granularity for compaction.
  30. const DATA_GRANULARITY = 1 << INDEX_SHIFT;
  31. // The BMP part of the index-2 table is fixed and linear and starts at offset 0.
  32. // Length=2048=0x800=0x10000>>SHIFT_2.
  33. const INDEX_2_OFFSET = 0;
  34. // The part of the index-2 table for U+D800..U+DBFF stores values for
  35. // lead surrogate code _units_ not code _points_.
  36. // Values for lead surrogate code _points_ are indexed with this portion of the table.
  37. // Length=32=0x20=0x400>>SHIFT_2. (There are 1024=0x400 lead surrogates.)
  38. const LSCP_INDEX_2_OFFSET = 0x10000 >> SHIFT_2;
  39. const LSCP_INDEX_2_LENGTH = 0x400 >> SHIFT_2;
  40. // Count the lengths of both BMP pieces. 2080=0x820
  41. const INDEX_2_BMP_LENGTH = LSCP_INDEX_2_OFFSET + LSCP_INDEX_2_LENGTH;
  42. // The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820.
  43. // Length 32=0x20 for lead bytes C0..DF, regardless of SHIFT_2.
  44. const UTF8_2B_INDEX_2_OFFSET = INDEX_2_BMP_LENGTH;
  45. const UTF8_2B_INDEX_2_LENGTH = 0x800 >> 6; // U+0800 is the first code point after 2-byte UTF-8
  46. // The index-1 table, only used for supplementary code points, at offset 2112=0x840.
  47. // Variable length, for code points up to highStart, where the last single-value range starts.
  48. // Maximum length 512=0x200=0x100000>>SHIFT_1.
  49. // (For 0x100000 supplementary code points U+10000..U+10ffff.)
  50. //
  51. // The part of the index-2 table for supplementary code points starts
  52. // after this index-1 table.
  53. //
  54. // Both the index-1 table and the following part of the index-2 table
  55. // are omitted completely if there is only BMP data.
  56. const INDEX_1_OFFSET = UTF8_2B_INDEX_2_OFFSET + UTF8_2B_INDEX_2_LENGTH;
  57. const MAX_INDEX_1_LENGTH = 0x100000 >> SHIFT_1;
  58. // The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80.
  59. // Used with linear access for single bytes 0..0xbf for simple error handling.
  60. // Length 64=0x40, not DATA_BLOCK_LENGTH.
  61. const BAD_UTF8_DATA_OFFSET = 0x80;
  62. // The start of non-linear-ASCII data blocks, at offset 192=0xc0.
  63. // !!!!
  64. const DATA_START_OFFSET = 0xc0;
  65. // The null data block.
  66. // Length 64=0x40 even if DATA_BLOCK_LENGTH is smaller,
  67. // to work with 6-bit trail bytes from 2-byte UTF-8.
  68. const DATA_NULL_OFFSET = DATA_START_OFFSET;
  69. // The start of allocated data blocks.
  70. const NEW_DATA_START_OFFSET = DATA_NULL_OFFSET + 0x40;
  71. // The start of data blocks for U+0800 and above.
  72. // Below, compaction uses a block length of 64 for 2-byte UTF-8.
  73. // From here on, compaction uses DATA_BLOCK_LENGTH.
  74. // Data values for 0x780 code points beyond ASCII.
  75. const DATA_0800_OFFSET = NEW_DATA_START_OFFSET + 0x780;
  76. // Start with allocation of 16k data entries. */
  77. const INITIAL_DATA_LENGTH = 1 << 14;
  78. // Grow about 8x each time.
  79. const MEDIUM_DATA_LENGTH = 1 << 17;
  80. // Maximum length of the runtime data array.
  81. // Limited by 16-bit index values that are left-shifted by INDEX_SHIFT,
  82. // and by uint16_t UTrie2Header.shiftedDataLength.
  83. const MAX_DATA_LENGTH_RUNTIME = 0xffff << INDEX_SHIFT;
  84. const INDEX_1_LENGTH = 0x110000 >> SHIFT_1;
  85. // Maximum length of the build-time data array.
  86. // One entry per 0x110000 code points, plus the illegal-UTF-8 block and the null block,
  87. // plus values for the 0x400 surrogate code units.
  88. const MAX_DATA_LENGTH_BUILDTIME = 0x110000 + 0x40 + 0x40 + 0x400;
  89. // At build time, leave a gap in the index-2 table,
  90. // at least as long as the maximum lengths of the 2-byte UTF-8 index-2 table
  91. // and the supplementary index-1 table.
  92. // Round up to INDEX_2_BLOCK_LENGTH for proper compacting.
  93. const INDEX_GAP_OFFSET = INDEX_2_BMP_LENGTH;
  94. const INDEX_GAP_LENGTH = ((UTF8_2B_INDEX_2_LENGTH + MAX_INDEX_1_LENGTH) + INDEX_2_MASK) & ~INDEX_2_MASK;
  95. // Maximum length of the build-time index-2 array.
  96. // Maximum number of Unicode code points (0x110000) shifted right by SHIFT_2,
  97. // plus the part of the index-2 table for lead surrogate code points,
  98. // plus the build-time index gap,
  99. // plus the null index-2 block.)
  100. const MAX_INDEX_2_LENGTH = (0x110000 >> SHIFT_2) + LSCP_INDEX_2_LENGTH + INDEX_GAP_LENGTH + INDEX_2_BLOCK_LENGTH;
  101. // The null index-2 block, following the gap in the index-2 table.
  102. const INDEX_2_NULL_OFFSET = INDEX_GAP_OFFSET + INDEX_GAP_LENGTH;
  103. // The start of allocated index-2 blocks.
  104. const INDEX_2_START_OFFSET = INDEX_2_NULL_OFFSET + INDEX_2_BLOCK_LENGTH;
  105. // Maximum length of the runtime index array.
  106. // Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength.
  107. // (The actual maximum length is lower,
  108. // (0x110000>>SHIFT_2)+UTF8_2B_INDEX_2_LENGTH+MAX_INDEX_1_LENGTH.)
  109. const MAX_INDEX_LENGTH = 0xffff;
  110. const equal_int = (a, s, t, length) => {
  111. for (let i = 0; i < length; i++) {
  112. if (a[s + i] !== a[t + i]) {
  113. return false;
  114. }
  115. }
  116. return true;
  117. };
  118. class UnicodeTrieBuilder {
  119. constructor(initialValue, errorValue) {
  120. let i, j;
  121. if (initialValue == null) {
  122. initialValue = 0;
  123. }
  124. this.initialValue = initialValue;
  125. if (errorValue == null) {
  126. errorValue = 0;
  127. }
  128. this.errorValue = errorValue;
  129. this.index1 = new Int32Array(INDEX_1_LENGTH);
  130. this.index2 = new Int32Array(MAX_INDEX_2_LENGTH);
  131. this.highStart = 0x110000;
  132. this.data = new Uint32Array(INITIAL_DATA_LENGTH);
  133. this.dataCapacity = INITIAL_DATA_LENGTH;
  134. this.firstFreeBlock = 0;
  135. this.isCompacted = false;
  136. // Multi-purpose per-data-block table.
  137. //
  138. // Before compacting:
  139. //
  140. // Per-data-block reference counters/free-block list.
  141. // 0: unused
  142. // >0: reference counter (number of index-2 entries pointing here)
  143. // <0: next free data block in free-block list
  144. //
  145. // While compacting:
  146. //
  147. // Map of adjusted indexes, used in compactData() and compactIndex2().
  148. // Maps from original indexes to new ones.
  149. this.map = new Int32Array(MAX_DATA_LENGTH_BUILDTIME >> SHIFT_2);
  150. for (i = 0; i < 0x80; i++) {
  151. this.data[i] = this.initialValue;
  152. }
  153. for (i = i; i < 0xc0; i++) {
  154. this.data[i] = this.errorValue;
  155. }
  156. for (i = DATA_NULL_OFFSET; i < NEW_DATA_START_OFFSET; i++) {
  157. this.data[i] = this.initialValue;
  158. }
  159. this.dataNullOffset = DATA_NULL_OFFSET;
  160. this.dataLength = NEW_DATA_START_OFFSET;
  161. // set the index-2 indexes for the 2=0x80>>SHIFT_2 ASCII data blocks
  162. i = 0;
  163. for (j = 0; j < 0x80; j += DATA_BLOCK_LENGTH) {
  164. this.index2[i] = j;
  165. this.map[i++] = 1;
  166. }
  167. // reference counts for the bad-UTF-8-data block
  168. for (j = j; j < 0xc0; j += DATA_BLOCK_LENGTH) {
  169. this.map[i++] = 0;
  170. }
  171. // Reference counts for the null data block: all blocks except for the ASCII blocks.
  172. // Plus 1 so that we don't drop this block during compaction.
  173. // Plus as many as needed for lead surrogate code points.
  174. // i==newTrie->dataNullOffset
  175. this.map[i++] = ((0x110000 >> SHIFT_2) - (0x80 >> SHIFT_2)) + 1 + LSCP_INDEX_2_LENGTH;
  176. j += DATA_BLOCK_LENGTH;
  177. for (j = j; j < NEW_DATA_START_OFFSET; j += DATA_BLOCK_LENGTH) {
  178. this.map[i++] = 0;
  179. }
  180. // set the remaining indexes in the BMP index-2 block
  181. // to the null data block
  182. for (i = 0x80 >> SHIFT_2; i < INDEX_2_BMP_LENGTH; i++) {
  183. this.index2[i] = DATA_NULL_OFFSET;
  184. }
  185. // Fill the index gap with impossible values so that compaction
  186. // does not overlap other index-2 blocks with the gap.
  187. for (i = 0; i < INDEX_GAP_LENGTH; i++) {
  188. this.index2[INDEX_GAP_OFFSET + i] = -1;
  189. }
  190. // set the indexes in the null index-2 block
  191. for (i = 0; i < INDEX_2_BLOCK_LENGTH; i++) {
  192. this.index2[INDEX_2_NULL_OFFSET + i] = DATA_NULL_OFFSET;
  193. }
  194. this.index2NullOffset = INDEX_2_NULL_OFFSET;
  195. this.index2Length = INDEX_2_START_OFFSET;
  196. // set the index-1 indexes for the linear index-2 block
  197. j = 0;
  198. for (i = 0; i < OMITTED_BMP_INDEX_1_LENGTH; i++) {
  199. this.index1[i] = j;
  200. j += INDEX_2_BLOCK_LENGTH;
  201. }
  202. // set the remaining index-1 indexes to the null index-2 block
  203. for (i = i; i < INDEX_1_LENGTH; i++) {
  204. this.index1[i] = INDEX_2_NULL_OFFSET;
  205. }
  206. // Preallocate and reset data for U+0080..U+07ff,
  207. // for 2-byte UTF-8 which will be compacted in 64-blocks
  208. // even if DATA_BLOCK_LENGTH is smaller.
  209. for (i = 0x80; i < 0x800; i += DATA_BLOCK_LENGTH) {
  210. this.set(i, this.initialValue);
  211. }
  212. }
  213. set(codePoint, value) {
  214. if ((codePoint < 0) || (codePoint > 0x10ffff)) {
  215. throw new Error('Invalid code point');
  216. }
  217. if (this.isCompacted) {
  218. throw new Error('Already compacted');
  219. }
  220. const block = this._getDataBlock(codePoint, true);
  221. this.data[block + (codePoint & DATA_MASK)] = value;
  222. return this;
  223. }
  224. setRange(start, end, value, overwrite) {
  225. let block, repeatBlock;
  226. if (overwrite == null) {
  227. overwrite = true;
  228. }
  229. if ((start > 0x10ffff) || (end > 0x10ffff) || (start > end)) {
  230. throw new Error('Invalid code point');
  231. }
  232. if (this.isCompacted) {
  233. throw new Error('Already compacted');
  234. }
  235. if (!overwrite && (value === this.initialValue)) {
  236. return this; // nothing to do
  237. }
  238. let limit = end + 1;
  239. if ((start & DATA_MASK) !== 0) {
  240. // set partial block at [start..following block boundary
  241. block = this._getDataBlock(start, true);
  242. const nextStart = (start + DATA_BLOCK_LENGTH) & ~DATA_MASK;
  243. if (nextStart <= limit) {
  244. this._fillBlock(block, start & DATA_MASK, DATA_BLOCK_LENGTH, value, this.initialValue, overwrite);
  245. start = nextStart;
  246. } else {
  247. this._fillBlock(block, start & DATA_MASK, limit & DATA_MASK, value, this.initialValue, overwrite);
  248. return this;
  249. }
  250. }
  251. // number of positions in the last, partial block
  252. const rest = limit & DATA_MASK;
  253. // round down limit to a block boundary
  254. limit &= ~DATA_MASK;
  255. // iterate over all-value blocks
  256. if (value === this.initialValue) {
  257. repeatBlock = this.dataNullOffset;
  258. } else {
  259. repeatBlock = -1;
  260. }
  261. while (start < limit) {
  262. let setRepeatBlock = false;
  263. if ((value === this.initialValue) && this._isInNullBlock(start, true)) {
  264. start += DATA_BLOCK_LENGTH; // nothing to do
  265. continue;
  266. }
  267. // get index value
  268. let i2 = this._getIndex2Block(start, true);
  269. i2 += (start >> SHIFT_2) & INDEX_2_MASK;
  270. block = this.index2[i2];
  271. if (this._isWritableBlock(block)) {
  272. // already allocated
  273. if (overwrite && (block >= DATA_0800_OFFSET)) {
  274. // We overwrite all values, and it's not a
  275. // protected (ASCII-linear or 2-byte UTF-8) block:
  276. // replace with the repeatBlock.
  277. setRepeatBlock = true;
  278. } else {
  279. // protected block: just write the values into this block
  280. this._fillBlock(block, 0, DATA_BLOCK_LENGTH, value, this.initialValue, overwrite);
  281. }
  282. } else if ((this.data[block] !== value) && (overwrite || (block === this.dataNullOffset))) {
  283. // Set the repeatBlock instead of the null block or previous repeat block:
  284. //
  285. // If !isWritableBlock() then all entries in the block have the same value
  286. // because it's the null block or a range block (the repeatBlock from a previous
  287. // call to utrie2_setRange32()).
  288. // No other blocks are used multiple times before compacting.
  289. //
  290. // The null block is the only non-writable block with the initialValue because
  291. // of the repeatBlock initialization above. (If value==initialValue, then
  292. // the repeatBlock will be the null data block.)
  293. //
  294. // We set our repeatBlock if the desired value differs from the block's value,
  295. // and if we overwrite any data or if the data is all initial values
  296. // (which is the same as the block being the null block, see above).
  297. setRepeatBlock = true;
  298. }
  299. if (setRepeatBlock) {
  300. if (repeatBlock >= 0) {
  301. this._setIndex2Entry(i2, repeatBlock);
  302. } else {
  303. // create and set and fill the repeatBlock
  304. repeatBlock = this._getDataBlock(start, true);
  305. this._writeBlock(repeatBlock, value);
  306. }
  307. }
  308. start += DATA_BLOCK_LENGTH;
  309. }
  310. if (rest > 0) {
  311. // set partial block at [last block boundary..limit
  312. block = this._getDataBlock(start, true);
  313. this._fillBlock(block, 0, rest, value, this.initialValue, overwrite);
  314. }
  315. return this;
  316. }
  317. get(c, fromLSCP) {
  318. let i2;
  319. if (fromLSCP == null) {
  320. fromLSCP = true;
  321. }
  322. if ((c < 0) || (c > 0x10ffff)) {
  323. return this.errorValue;
  324. }
  325. if ((c >= this.highStart) && (!((c >= 0xd800) && (c < 0xdc00)) || fromLSCP)) {
  326. return this.data[this.dataLength - DATA_GRANULARITY];
  327. }
  328. if (((c >= 0xd800) && (c < 0xdc00)) && fromLSCP) {
  329. i2 = (LSCP_INDEX_2_OFFSET - (0xd800 >> SHIFT_2)) + (c >> SHIFT_2);
  330. } else {
  331. i2 = this.index1[c >> SHIFT_1] + ((c >> SHIFT_2) & INDEX_2_MASK);
  332. }
  333. const block = this.index2[i2];
  334. return this.data[block + (c & DATA_MASK)];
  335. }
  336. _isInNullBlock(c, forLSCP) {
  337. let i2;
  338. if (((c & 0xfffffc00) === 0xd800) && forLSCP) {
  339. i2 = (LSCP_INDEX_2_OFFSET - (0xd800 >> SHIFT_2)) + (c >> SHIFT_2);
  340. } else {
  341. i2 = this.index1[c >> SHIFT_1] + ((c >> SHIFT_2) & INDEX_2_MASK);
  342. }
  343. const block = this.index2[i2];
  344. return block === this.dataNullOffset;
  345. }
  346. _allocIndex2Block() {
  347. const newBlock = this.index2Length;
  348. const newTop = newBlock + INDEX_2_BLOCK_LENGTH;
  349. if (newTop > this.index2.length) {
  350. // Should never occur.
  351. // Either MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
  352. // or the code writes more values than should be possible.
  353. throw new Error("Internal error in Trie2 creation.");
  354. }
  355. this.index2Length = newTop;
  356. this.index2.set(this.index2.subarray(this.index2NullOffset, this.index2NullOffset + INDEX_2_BLOCK_LENGTH), newBlock);
  357. return newBlock;
  358. }
  359. _getIndex2Block(c, forLSCP) {
  360. if ((c >= 0xd800) && (c < 0xdc00) && forLSCP) {
  361. return LSCP_INDEX_2_OFFSET;
  362. }
  363. const i1 = c >> SHIFT_1;
  364. let i2 = this.index1[i1];
  365. if (i2 === this.index2NullOffset) {
  366. i2 = this._allocIndex2Block();
  367. this.index1[i1] = i2;
  368. }
  369. return i2;
  370. }
  371. _isWritableBlock(block) {
  372. return (block !== this.dataNullOffset) && (this.map[block >> SHIFT_2] === 1);
  373. }
  374. _allocDataBlock(copyBlock) {
  375. let newBlock;
  376. if (this.firstFreeBlock !== 0) {
  377. // get the first free block
  378. newBlock = this.firstFreeBlock;
  379. this.firstFreeBlock = -this.map[newBlock >> SHIFT_2];
  380. } else {
  381. // get a new block from the high end
  382. newBlock = this.dataLength;
  383. const newTop = newBlock + DATA_BLOCK_LENGTH;
  384. if (newTop > this.dataCapacity) {
  385. // out of memory in the data array
  386. let capacity;
  387. if (this.dataCapacity < MEDIUM_DATA_LENGTH) {
  388. capacity = MEDIUM_DATA_LENGTH;
  389. } else if (this.dataCapacity < MAX_DATA_LENGTH_BUILDTIME) {
  390. capacity = MAX_DATA_LENGTH_BUILDTIME;
  391. } else {
  392. // Should never occur.
  393. // Either MAX_DATA_LENGTH_BUILDTIME is incorrect,
  394. // or the code writes more values than should be possible.
  395. throw new Error("Internal error in Trie2 creation.");
  396. }
  397. const newData = new Uint32Array(capacity);
  398. newData.set(this.data.subarray(0, this.dataLength));
  399. this.data = newData;
  400. this.dataCapacity = capacity;
  401. }
  402. this.dataLength = newTop;
  403. }
  404. this.data.set(this.data.subarray(copyBlock, copyBlock + DATA_BLOCK_LENGTH), newBlock);
  405. this.map[newBlock >> SHIFT_2] = 0;
  406. return newBlock;
  407. }
  408. _releaseDataBlock(block) {
  409. // put this block at the front of the free-block chain
  410. this.map[block >> SHIFT_2] = -this.firstFreeBlock;
  411. this.firstFreeBlock = block;
  412. }
  413. _setIndex2Entry(i2, block) {
  414. ++this.map[block >> SHIFT_2]; // increment first, in case block == oldBlock!
  415. const oldBlock = this.index2[i2];
  416. if (--this.map[oldBlock >> SHIFT_2] === 0) {
  417. this._releaseDataBlock(oldBlock);
  418. }
  419. this.index2[i2] = block;
  420. }
  421. _getDataBlock(c, forLSCP) {
  422. let i2 = this._getIndex2Block(c, forLSCP);
  423. i2 += (c >> SHIFT_2) & INDEX_2_MASK;
  424. const oldBlock = this.index2[i2];
  425. if (this._isWritableBlock(oldBlock)) {
  426. return oldBlock;
  427. }
  428. // allocate a new data block
  429. const newBlock = this._allocDataBlock(oldBlock);
  430. this._setIndex2Entry(i2, newBlock);
  431. return newBlock;
  432. }
  433. _fillBlock(block, start, limit, value, initialValue, overwrite) {
  434. let i;
  435. if (overwrite) {
  436. for (i = block + start; i < block + limit; i++) {
  437. this.data[i] = value;
  438. }
  439. } else {
  440. for (i = block + start; i < block + limit; i++) {
  441. if (this.data[i] === initialValue) {
  442. this.data[i] = value;
  443. }
  444. }
  445. }
  446. }
  447. _writeBlock(block, value) {
  448. const limit = block + DATA_BLOCK_LENGTH;
  449. while (block < limit) {
  450. this.data[block++] = value;
  451. }
  452. }
  453. _findHighStart(highValue) {
  454. let prevBlock, prevI2Block;
  455. const data32 = this.data;
  456. const { initialValue } = this;
  457. const { index2NullOffset } = this;
  458. const nullBlock = this.dataNullOffset;
  459. // set variables for previous range
  460. if (highValue === initialValue) {
  461. prevI2Block = index2NullOffset;
  462. prevBlock = nullBlock;
  463. } else {
  464. prevI2Block = -1;
  465. prevBlock = -1;
  466. }
  467. const prev = 0x110000;
  468. // enumerate index-2 blocks
  469. let i1 = INDEX_1_LENGTH;
  470. let c = prev;
  471. while (c > 0) {
  472. const i2Block = this.index1[--i1];
  473. if (i2Block === prevI2Block) {
  474. // the index-2 block is the same as the previous one, and filled with highValue
  475. c -= CP_PER_INDEX_1_ENTRY;
  476. continue;
  477. }
  478. prevI2Block = i2Block;
  479. if (i2Block === index2NullOffset) {
  480. // this is the null index-2 block
  481. if (highValue !== initialValue) {
  482. return c;
  483. }
  484. c -= CP_PER_INDEX_1_ENTRY;
  485. } else {
  486. // enumerate data blocks for one index-2 block
  487. let i2 = INDEX_2_BLOCK_LENGTH;
  488. while (i2 > 0) {
  489. const block = this.index2[i2Block + --i2];
  490. if (block === prevBlock) {
  491. // the block is the same as the previous one, and filled with highValue
  492. c -= DATA_BLOCK_LENGTH;
  493. continue;
  494. }
  495. prevBlock = block;
  496. if (block === nullBlock) {
  497. // this is the null data block
  498. if (highValue !== initialValue) {
  499. return c;
  500. }
  501. c -= DATA_BLOCK_LENGTH;
  502. } else {
  503. let j = DATA_BLOCK_LENGTH;
  504. while (j > 0) {
  505. const value = data32[block + --j];
  506. if (value !== highValue) {
  507. return c;
  508. }
  509. --c;
  510. }
  511. }
  512. }
  513. }
  514. }
  515. // deliver last range
  516. return 0;
  517. }
  518. _findSameDataBlock(dataLength, otherBlock, blockLength) {
  519. // ensure that we do not even partially get past dataLength
  520. dataLength -= blockLength;
  521. let block = 0;
  522. while (block <= dataLength) {
  523. if (equal_int(this.data, block, otherBlock, blockLength)) {
  524. return block;
  525. }
  526. block += DATA_GRANULARITY;
  527. }
  528. return -1;
  529. }
  530. _findSameIndex2Block(index2Length, otherBlock) {
  531. // ensure that we do not even partially get past index2Length
  532. index2Length -= INDEX_2_BLOCK_LENGTH;
  533. for (let block = 0; block <= index2Length; block++) {
  534. if (equal_int(this.index2, block, otherBlock, INDEX_2_BLOCK_LENGTH)) {
  535. return block;
  536. }
  537. }
  538. return -1;
  539. }
  540. _compactData() {
  541. // do not compact linear-ASCII data
  542. let newStart = DATA_START_OFFSET;
  543. let start = 0;
  544. let i = 0;
  545. while (start < newStart) {
  546. this.map[i++] = start;
  547. start += DATA_BLOCK_LENGTH;
  548. }
  549. // Start with a block length of 64 for 2-byte UTF-8,
  550. // then switch to DATA_BLOCK_LENGTH.
  551. let blockLength = 64;
  552. let blockCount = blockLength >> SHIFT_2;
  553. start = newStart;
  554. while (start < this.dataLength) {
  555. // start: index of first entry of current block
  556. // newStart: index where the current block is to be moved
  557. // (right after current end of already-compacted data)
  558. var mapIndex, movedStart;
  559. if (start === DATA_0800_OFFSET) {
  560. blockLength = DATA_BLOCK_LENGTH;
  561. blockCount = 1;
  562. }
  563. // skip blocks that are not used
  564. if (this.map[start >> SHIFT_2] <= 0) {
  565. // advance start to the next block
  566. start += blockLength;
  567. // leave newStart with the previous block!
  568. continue;
  569. }
  570. // search for an identical block
  571. if ((movedStart = this._findSameDataBlock(newStart, start, blockLength)) >= 0) {
  572. // found an identical block, set the other block's index value for the current block
  573. mapIndex = start >> SHIFT_2;
  574. for (i = blockCount; i > 0; i--) {
  575. this.map[mapIndex++] = movedStart;
  576. movedStart += DATA_BLOCK_LENGTH;
  577. }
  578. // advance start to the next block
  579. start += blockLength;
  580. // leave newStart with the previous block!
  581. continue;
  582. }
  583. // see if the beginning of this block can be overlapped with the end of the previous block
  584. // look for maximum overlap (modulo granularity) with the previous, adjacent block
  585. let overlap = blockLength - DATA_GRANULARITY;
  586. while ((overlap > 0) && !equal_int(this.data, (newStart - overlap), start, overlap)) {
  587. overlap -= DATA_GRANULARITY;
  588. }
  589. if ((overlap > 0) || (newStart < start)) {
  590. // some overlap, or just move the whole block
  591. movedStart = newStart - overlap;
  592. mapIndex = start >> SHIFT_2;
  593. for (i = blockCount; i > 0; i--) {
  594. this.map[mapIndex++] = movedStart;
  595. movedStart += DATA_BLOCK_LENGTH;
  596. }
  597. // move the non-overlapping indexes to their new positions
  598. start += overlap;
  599. for (i = blockLength - overlap; i > 0; i--) {
  600. this.data[newStart++] = this.data[start++];
  601. }
  602. } else { // no overlap && newStart==start
  603. mapIndex = start >> SHIFT_2;
  604. for (i = blockCount; i > 0; i--) {
  605. this.map[mapIndex++] = start;
  606. start += DATA_BLOCK_LENGTH;
  607. }
  608. newStart = start;
  609. }
  610. }
  611. // now adjust the index-2 table
  612. i = 0;
  613. while (i < this.index2Length) {
  614. // Gap indexes are invalid (-1). Skip over the gap.
  615. if (i === INDEX_GAP_OFFSET) {
  616. i += INDEX_GAP_LENGTH;
  617. }
  618. this.index2[i] = this.map[this.index2[i] >> SHIFT_2];
  619. ++i;
  620. }
  621. this.dataNullOffset = this.map[this.dataNullOffset >> SHIFT_2];
  622. // ensure dataLength alignment
  623. while ((newStart & (DATA_GRANULARITY - 1)) !== 0) {
  624. this.data[newStart++] = this.initialValue;
  625. }
  626. this.dataLength = newStart;
  627. }
  628. _compactIndex2() {
  629. // do not compact linear-BMP index-2 blocks
  630. let newStart = INDEX_2_BMP_LENGTH;
  631. let start = 0;
  632. let i = 0;
  633. while (start < newStart) {
  634. this.map[i++] = start;
  635. start += INDEX_2_BLOCK_LENGTH;
  636. }
  637. // Reduce the index table gap to what will be needed at runtime.
  638. newStart += UTF8_2B_INDEX_2_LENGTH + ((this.highStart - 0x10000) >> SHIFT_1);
  639. start = INDEX_2_NULL_OFFSET;
  640. while (start < this.index2Length) {
  641. // start: index of first entry of current block
  642. // newStart: index where the current block is to be moved
  643. // (right after current end of already-compacted data)
  644. // search for an identical block
  645. var movedStart;
  646. if ((movedStart = this._findSameIndex2Block(newStart, start)) >= 0) {
  647. // found an identical block, set the other block's index value for the current block
  648. this.map[start >> SHIFT_1_2] = movedStart;
  649. // advance start to the next block
  650. start += INDEX_2_BLOCK_LENGTH;
  651. // leave newStart with the previous block!
  652. continue;
  653. }
  654. // see if the beginning of this block can be overlapped with the end of the previous block
  655. // look for maximum overlap with the previous, adjacent block
  656. let overlap = INDEX_2_BLOCK_LENGTH - 1;
  657. while ((overlap > 0) && !equal_int(this.index2, (newStart - overlap), start, overlap)) {
  658. --overlap;
  659. }
  660. if ((overlap > 0) || (newStart < start)) {
  661. // some overlap, or just move the whole block
  662. this.map[start >> SHIFT_1_2] = newStart - overlap;
  663. // move the non-overlapping indexes to their new positions
  664. start += overlap;
  665. for (i = INDEX_2_BLOCK_LENGTH - overlap; i > 0; i--) {
  666. this.index2[newStart++] = this.index2[start++];
  667. }
  668. } else { // no overlap && newStart==start
  669. this.map[start >> SHIFT_1_2] = start;
  670. start += INDEX_2_BLOCK_LENGTH;
  671. newStart = start;
  672. }
  673. }
  674. // now adjust the index-1 table
  675. for (i = 0; i < INDEX_1_LENGTH; i++) {
  676. this.index1[i] = this.map[this.index1[i] >> SHIFT_1_2];
  677. }
  678. this.index2NullOffset = this.map[this.index2NullOffset >> SHIFT_1_2];
  679. // Ensure data table alignment:
  680. // Needs to be granularity-aligned for 16-bit trie
  681. // (so that dataMove will be down-shiftable),
  682. // and 2-aligned for uint32_t data.
  683. // Arbitrary value: 0x3fffc not possible for real data.
  684. while ((newStart & ((DATA_GRANULARITY - 1) | 1)) !== 0) {
  685. this.index2[newStart++] = 0x0000ffff << INDEX_SHIFT;
  686. }
  687. this.index2Length = newStart;
  688. }
  689. _compact() {
  690. // find highStart and round it up
  691. let highValue = this.get(0x10ffff);
  692. let highStart = this._findHighStart(highValue);
  693. highStart = (highStart + (CP_PER_INDEX_1_ENTRY - 1)) & ~(CP_PER_INDEX_1_ENTRY - 1);
  694. if (highStart === 0x110000) {
  695. highValue = this.errorValue;
  696. }
  697. // Set trie->highStart only after utrie2_get32(trie, highStart).
  698. // Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
  699. this.highStart = highStart;
  700. if (this.highStart < 0x110000) {
  701. // Blank out [highStart..10ffff] to release associated data blocks.
  702. const suppHighStart = this.highStart <= 0x10000 ? 0x10000 : this.highStart;
  703. this.setRange(suppHighStart, 0x10ffff, this.initialValue, true);
  704. }
  705. this._compactData();
  706. if (this.highStart > 0x10000) {
  707. this._compactIndex2();
  708. }
  709. // Store the highValue in the data array and round up the dataLength.
  710. // Must be done after compactData() because that assumes that dataLength
  711. // is a multiple of DATA_BLOCK_LENGTH.
  712. this.data[this.dataLength++] = highValue;
  713. while ((this.dataLength & (DATA_GRANULARITY - 1)) !== 0) {
  714. this.data[this.dataLength++] = this.initialValue;
  715. }
  716. this.isCompacted = true;
  717. }
  718. freeze() {
  719. let allIndexesLength, i;
  720. if (!this.isCompacted) {
  721. this._compact();
  722. }
  723. if (this.highStart <= 0x10000) {
  724. allIndexesLength = INDEX_1_OFFSET;
  725. } else {
  726. allIndexesLength = this.index2Length;
  727. }
  728. const dataMove = allIndexesLength;
  729. // are indexLength and dataLength within limits?
  730. if ((allIndexesLength > MAX_INDEX_LENGTH) || // for unshifted indexLength
  731. ((dataMove + this.dataNullOffset) > 0xffff) || // for unshifted dataNullOffset
  732. ((dataMove + DATA_0800_OFFSET) > 0xffff) || // for unshifted 2-byte UTF-8 index-2 values
  733. ((dataMove + this.dataLength) > MAX_DATA_LENGTH_RUNTIME)) { // for shiftedDataLength
  734. throw new Error("Trie data is too large.");
  735. }
  736. // calculate the sizes of, and allocate, the index and data arrays
  737. const indexLength = allIndexesLength + this.dataLength;
  738. const data = new Int32Array(indexLength);
  739. // write the index-2 array values shifted right by INDEX_SHIFT, after adding dataMove
  740. let destIdx = 0;
  741. for (i = 0; i < INDEX_2_BMP_LENGTH; i++) {
  742. data[destIdx++] = ((this.index2[i] + dataMove) >> INDEX_SHIFT);
  743. }
  744. // write UTF-8 2-byte index-2 values, not right-shifted
  745. for (i = 0; i < 0xc2 - 0xc0; i++) { // C0..C1
  746. data[destIdx++] = (dataMove + BAD_UTF8_DATA_OFFSET);
  747. }
  748. for (i = i; i < 0xe0 - 0xc0; i++) { // C2..DF
  749. data[destIdx++] = (dataMove + this.index2[i << (6 - SHIFT_2)]);
  750. }
  751. if (this.highStart > 0x10000) {
  752. const index1Length = (this.highStart - 0x10000) >> SHIFT_1;
  753. const index2Offset = INDEX_2_BMP_LENGTH + UTF8_2B_INDEX_2_LENGTH + index1Length;
  754. // write 16-bit index-1 values for supplementary code points
  755. for (i = 0; i < index1Length; i++) {
  756. data[destIdx++] = (INDEX_2_OFFSET + this.index1[i + OMITTED_BMP_INDEX_1_LENGTH]);
  757. }
  758. // write the index-2 array values for supplementary code points,
  759. // shifted right by INDEX_SHIFT, after adding dataMove
  760. for (i = 0; i < this.index2Length - index2Offset; i++) {
  761. data[destIdx++] = ((dataMove + this.index2[index2Offset + i]) >> INDEX_SHIFT);
  762. }
  763. }
  764. // write 16-bit data values
  765. for (i = 0; i < this.dataLength; i++) {
  766. data[destIdx++] = this.data[i];
  767. }
  768. const dest = new UnicodeTrie({
  769. data,
  770. highStart: this.highStart,
  771. errorValue: this.errorValue
  772. });
  773. return dest;
  774. }
  775. // Generates a Buffer containing the serialized and compressed trie.
  776. // Trie data is compressed twice using the deflate algorithm to minimize file size.
  777. // Format:
  778. // uint32_t highStart;
  779. // uint32_t errorValue;
  780. // uint32_t uncompressedDataLength;
  781. // uint8_t trieData[dataLength];
  782. toBuffer() {
  783. const trie = this.freeze();
  784. const data = new Uint8Array(trie.data.buffer);
  785. // swap bytes to little-endian
  786. swap32LE(data);
  787. let compressed = pako.deflateRaw(data);
  788. compressed = pako.deflateRaw(compressed);
  789. const buf = Buffer.alloc(compressed.length + 12);
  790. buf.writeUInt32LE(trie.highStart, 0);
  791. buf.writeUInt32LE(trie.errorValue, 4);
  792. buf.writeUInt32LE(data.length, 8);
  793. for (let i = 0; i < compressed.length; i++) {
  794. const b = compressed[i];
  795. buf[i + 12] = b;
  796. }
  797. return buf;
  798. }
  799. }
  800. module.exports = UnicodeTrieBuilder;