index.js 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  1. import { html } from 'parse5';
  2. import { SAXParser, } from 'parse5-sax-parser';
  3. import { escapeText, escapeAttribute } from 'entities/lib/escape.js';
  4. /**
  5. * Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML rewriter.
  6. * A [transform stream](https://nodejs.org/api/stream.html#stream_class_stream_transform) (which means you can pipe _through_ it, see example).
  7. *
  8. * The rewriter uses the raw source representation of tokens if they are not modified by the user. Therefore, the resulting
  9. * HTML is not affected by parser error-recovery mechanisms as in a classical parsing-serialization roundtrip.
  10. *
  11. * @example
  12. *
  13. * ```js
  14. * const { RewritingStream } = require('parse5-html-rewriting-stream');
  15. * const http = require('http');
  16. * const fs = require('fs');
  17. *
  18. * const file = fs.createWriteStream('/home/google.com.html');
  19. * const rewriter = new RewritingStream();
  20. *
  21. * // Replace spans with divs
  22. * rewriter.on('startTag', startTag => {
  23. * if (startTag.tagName === 'span') {
  24. * startTag.tagName = 'div';
  25. * }
  26. *
  27. * rewriter.emitStartTag(startTag);
  28. * });
  29. *
  30. * rewriter.on('endTag', endTag => {
  31. * if (endTag.tagName === 'span') {
  32. * endTag.tagName = 'div';
  33. * }
  34. *
  35. * rewriter.emitEndTag(endTag);
  36. * });
  37. *
  38. * // Wrap all text nodes with an <i> tag
  39. * rewriter.on('text', (_, raw) => {
  40. * // Use the raw representation of text without HTML entities decoding
  41. * rewriter.emitRaw(`<i>${raw}</i>`);
  42. * });
  43. *
  44. * http.get('http://google.com', res => {
  45. * // Assumes response is UTF-8.
  46. * res.setEncoding('utf8');
  47. * // `RewritingStream` is a `Transform` stream, which means you can pipe
  48. * // through it.
  49. * res.pipe(rewriter).pipe(file);
  50. * });
  51. * ```
  52. */
  export class RewritingStream extends SAXParser {
    /** Note: `sourceCodeLocationInfo` is always enabled. */
    constructor() {
      super({ sourceCodeLocationInfo: true });
    }

    /**
     * Runs the parent implementation on `chunk` and discards its return value.
     *
     * @param chunk Input chunk, forwarded unchanged to the parent class.
     * @returns Always the empty string.
     */
    _transformChunk(chunk) {
      // NOTE: ignore upstream return values as we want to push to
      // the `Writable` part of the `Transform` stream ourselves.
      super._transformChunk(chunk);
      return '';
    }

    /**
     * Returns the slice of the preprocessor's buffered HTML covered by `location`.
     *
     * @param location Object with numeric `startOffset` and `endOffset`.
     * @returns The raw source text between the two offsets.
     */
    _getRawHtml(location) {
      const { droppedBufferSize, html } = this.tokenizer.preprocessor;
      // Both offsets are shifted down by `droppedBufferSize` before indexing
      // into the buffered `html` string.
      const start = location.startOffset - droppedBufferSize;
      const end = location.endOffset - droppedBufferSize;

      return html.slice(start, end);
    }

    // Events

    /**
     * Delegates to the parent implementation; when that returns a falsy value,
     * the token's raw source is written to the output unchanged.
     *
     * @param eventName Name of the event being dispatched.
     * @param token Token being dispatched; its `sourceCodeLocation` is used to
     * recover the raw source.
     * @returns Always `true`.
     */
    emitIfListenerExists(eventName, token) {
      if (!super.emitIfListenerExists(eventName, token)) {
        this.emitRaw(this._getRawHtml(token.sourceCodeLocation));
      }

      // NOTE: don't skip new lines after `<pre>` and other tags,
      // otherwise we'll have incorrect raw data.
      this.parserFeedbackSimulator.skipNextNewLine = false;
      return true;
    }

    // Emitter API

    /**
     * Emits `eventName` with the token and, as a second argument, the token's
     * raw source text.
     *
     * @param eventName Name of the event to emit.
     * @param token Token passed to listeners; must carry `sourceCodeLocation`.
     */
    _emitToken(eventName, token) {
      this.emit(eventName, token, this._getRawHtml(token.sourceCodeLocation));
    }

    /**
     * Emits a serialized document type token into the output stream.
     *
     * @param token Object with `name`, `publicId` and `systemId`; each may be `null`.
     */
    emitDoctype(token) {
      // Guard against a `null` name so it is not interpolated as the literal
      // text "null" (which would read back as a doctype named `null`).
      const name = token.name === null ? '' : token.name;
      let res = `<!DOCTYPE ${name}`;

      if (token.publicId !== null) {
        res += ` PUBLIC "${token.publicId}"`;
      } else if (token.systemId !== null) {
        // The `SYSTEM` keyword is only written when there is no public identifier.
        res += ' SYSTEM';
      }

      if (token.systemId !== null) {
        res += ` "${token.systemId}"`;
      }

      res += '>';
      this.push(res);
    }

    /**
     * Emits a serialized start tag token into the output stream.
     *
     * @param token Object with `tagName`, an iterable `attrs` of `{ name, value }`
     * entries, and a `selfClosing` flag.
     */
    emitStartTag(token) {
      let res = `<${token.tagName}`;

      for (const attr of token.attrs) {
        res += ` ${attr.name}="${escapeAttribute(attr.value)}"`;
      }

      res += token.selfClosing ? '/>' : '>';
      this.push(res);
    }

    /**
     * Emits a serialized end tag token into the output stream.
     *
     * @param token Object with `tagName`.
     */
    emitEndTag(token) {
      this.push(`</${token.tagName}>`);
    }

    /**
     * Emits a serialized text token into the output stream.
     *
     * The text is written verbatim only when the stream is not in foreign
     * content and `html.hasUnescapedText` accepts the last start tag name;
     * otherwise it goes through `escapeText`.
     *
     * @param token Object with a `text` string.
     */
    emitText({ text }) {
      const isUnescapedContext =
        !this.parserFeedbackSimulator.inForeignContent &&
        html.hasUnescapedText(this.tokenizer.lastStartTagName, true);

      this.push(isUnescapedContext ? text : escapeText(text));
    }

    /**
     * Emits a serialized comment token into the output stream.
     *
     * @param token Object with a `text` string, written between `<!--` and `-->`.
     */
    emitComment(token) {
      this.push(`<!--${token.text}-->`);
    }

    /**
     * Emits a raw HTML string into the output stream.
     *
     * @param html String written to the output as-is. (The parameter name
     * shadows the module-level `html` import inside this method only.)
     */
    emitRaw(html) {
      this.push(html);
    }
  }
  128. //# sourceMappingURL=index.js.map