encode.js 3.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. import { getCodePoint, XML_BITSET_VALUE } from "./escape.js";
  2. import { htmlTrie } from "./generated/encode-html.js";
  3. /**
  4. * We store the characters to consider as a compact bitset for fast lookups.
  5. */
  6. const HTML_BITSET = /* #__PURE__ */ new Uint32Array([
  7. 5632, // Bits for 09,0A,0C
  8. 4227923966, // 32..63 -> 21-2D (minus space), 2E,2F,3A-3F
  9. 4160749569, // 64..95 -> 40, 5B-5F
  10. 939524097, // 96..127-> 60, 7B-7D
  11. ]);
  12. const XML_BITSET = /* #__PURE__ */ new Uint32Array([0, XML_BITSET_VALUE, 0, 0]);
  13. /**
  14. * Encodes all characters in the input using HTML entities. This includes
  15. * characters that are valid ASCII characters in HTML documents, such as `#`.
  16. *
  17. * To get a more compact output, consider using the `encodeNonAsciiHTML`
  18. * function, which will only encode characters that are not valid in HTML
  19. * documents, as well as non-ASCII characters.
  20. *
  21. * If a character has no equivalent entity, a numeric hexadecimal reference
  22. * (eg. `ü`) will be used.
  23. */
  24. export function encodeHTML(input) {
  25. return encodeHTMLTrieRe(HTML_BITSET, input);
  26. }
  27. /**
  28. * Encodes all non-ASCII characters, as well as characters not valid in HTML
  29. * documents using HTML entities. This function will not encode characters that
  30. * are valid in HTML documents, such as `#`.
  31. *
  32. * If a character has no equivalent entity, a numeric hexadecimal reference
  33. * (eg. `ü`) will be used.
  34. */
  35. export function encodeNonAsciiHTML(input) {
  36. return encodeHTMLTrieRe(XML_BITSET, input);
  37. }
  38. function encodeHTMLTrieRe(bitset, input) {
  39. let out;
  40. let last = 0; // Start of the next untouched slice.
  41. const { length } = input;
  42. for (let index = 0; index < length; index++) {
  43. const char = input.charCodeAt(index);
  44. // Skip ASCII characters that don't need encoding
  45. if (char < 0x80 && !((bitset[char >>> 5] >>> char) & 1)) {
  46. continue;
  47. }
  48. if (out === undefined)
  49. out = input.substring(0, index);
  50. else if (last !== index)
  51. out += input.substring(last, index);
  52. let node = htmlTrie.get(char);
  53. if (typeof node === "object") {
  54. if (index + 1 < length) {
  55. const nextChar = input.charCodeAt(index + 1);
  56. const value = typeof node.next === "number"
  57. ? node.next === nextChar
  58. ? node.nextValue
  59. : undefined
  60. : node.next.get(nextChar);
  61. if (value !== undefined) {
  62. out += value;
  63. index++;
  64. last = index + 1;
  65. continue;
  66. }
  67. }
  68. node = node.value;
  69. }
  70. if (node === undefined) {
  71. const cp = getCodePoint(input, index);
  72. out += `&#x${cp.toString(16)};`;
  73. if (cp !== char)
  74. index++;
  75. last = index + 1;
  76. }
  77. else {
  78. out += node;
  79. last = index + 1;
  80. }
  81. }
  82. if (out === undefined)
  83. return input;
  84. if (last < length)
  85. out += input.substr(last);
  86. return out;
  87. }
  88. //# sourceMappingURL=encode.js.map