index.js 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. import { Transform } from 'node:stream';
  2. import { DevNullStream } from './dev-null-stream.js';
  3. import { ParserFeedbackSimulator } from './parser-feedback-simulator.js';
  /**
   * Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML parser.
   * A [transform stream](https://nodejs.org/api/stream.html#stream_class_stream_transform)
   * (which means you can pipe _through_ it, see example). Every chunk written to
   * the parser is handed to the tokenizer and then forwarded downstream unchanged.
   *
   * Tokens are reported through the events `startTag`, `endTag`, `doctype`,
   * `comment` and `text`. An event is only emitted when at least one listener
   * is registered for it.
   *
   * @example
   *
   * ```js
   * import { SAXParser } from 'parse5-sax-parser';
   * import http from 'node:http';
   * import fs from 'node:fs';
   *
   * const file = fs.createWriteStream('/home/google.com.html');
   * const parser = new SAXParser();
   *
   * parser.on('text', text => {
   *     // Handle page text content
   *     ...
   * });
   *
   * http.get('http://google.com', res => {
   *     // `SAXParser` is the `Transform` stream, which means you can pipe
   *     // through it. So, you can analyze the page content and, e.g., save it
   *     // to the file at the same time:
   *     res.pipe(parser).pipe(file);
   * });
   * ```
   */
  export class SAXParser extends Transform {
      /**
       * @param options Parsing options. They are merged over the default
       * `{ sourceCodeLocationInfo: false }` and passed to the
       * `ParserFeedbackSimulator`.
       */
      constructor(options = {}) {
          // `decodeStrings: false` keeps string chunks as strings on the way in.
          super({ encoding: 'utf8', decodeStrings: false });
          // Text accumulated from consecutive character tokens, or `null` if none.
          this.pendingText = null;
          // Becomes `true` in `_final`; passed to the tokenizer as its "last chunk" flag.
          this.lastChunkWritten = false;
          // Once `true`, chunks are no longer handed to the tokenizer.
          this.stopped = false;
          this.options = {
              sourceCodeLocationInfo: false,
              ...options,
          };
          // This instance is passed as the token handler (see the `on*` methods below).
          this.parserFeedbackSimulator = new ParserFeedbackSimulator(this.options, this);
          this.tokenizer = this.parserFeedbackSimulator.tokenizer;
          // NOTE: always pipe the stream to the /dev/null stream to avoid
          // the `highWaterMark` to be hit even if we don't have consumers.
          // (see: https://github.com/inikulin/parse5/issues/97#issuecomment-171940774)
          this.pipe(new DevNullStream());
      }

      //`Transform` implementation

      /**
       * Tokenizes the incoming chunk and passes it through to the readable side.
       *
       * @param chunk Chunk written to the stream; must be a string.
       * @param _encoding Unused.
       * @param callback Receives `null` and the chunk to push downstream.
       * @throws {TypeError} If `chunk` is not a string. The error is thrown
       * synchronously rather than passed to `callback`.
       */
      _transform(chunk, _encoding, callback) {
          if (typeof chunk !== 'string') {
              throw new TypeError('Parser can work only with string streams.');
          }
          callback(null, this._transformChunk(chunk));
      }

      /**
       * Called when the writable side ends: marks the input as complete and
       * writes a final empty chunk so the tokenizer sees the "last chunk" flag.
       *
       * @param callback Completion callback, invoked without arguments.
       */
      _final(callback) {
          this.lastChunkWritten = true;
          this._transformChunk('');
          // The `_final` callback takes only an optional error, so no data is passed.
          callback();
      }

      /**
       * Stops parsing. Useful if you want the parser to stop consuming CPU time
       * once you've obtained the desired info from the input stream. Doesn't
       * prevent piping, so that data will flow through the parser as usual.
       *
       * @example
       *
       * ```js
       * import { SAXParser } from 'parse5-sax-parser';
       * import http from 'node:http';
       * import fs from 'node:fs';
       *
       * const file = fs.createWriteStream('google.com.html');
       * const parser = new SAXParser();
       *
       * parser.on('doctype', ({ name, publicId, systemId }) => {
       *     // Process doctype info and stop parsing
       *     ...
       *     parser.stop();
       * });
       *
       * http.get('http://google.com', res => {
       *     // Despite the fact that parser.stop() was called whole
       *     // content of the page will be written to the file
       *     res.pipe(parser).pipe(file);
       * });
       * ```
       */
      stop() {
          this.stopped = true;
          this.tokenizer.pause();
      }

      //Internals

      /**
       * Hands `chunk` to the tokenizer unless parsing has been stopped.
       *
       * @param chunk String chunk to tokenize.
       * @returns The same `chunk`, to be pushed downstream.
       */
      _transformChunk(chunk) {
          if (!this.stopped) {
              this.tokenizer.write(chunk, this.lastChunkWritten);
          }
          return chunk;
      }

      /**
       * Accumulates character tokens into a single pending `text` event.
       *
       * @internal
       */
      onCharacter({ chars, location }) {
          if (this.pendingText === null) {
              this.pendingText = { text: chars, sourceCodeLocation: location };
          }
          else {
              this.pendingText.text += chars;
              if (location && this.pendingText.sourceCodeLocation) {
                  // Keep the start of the run and extend its end to the newest token.
                  const { endLine, endCol, endOffset } = location;
                  this.pendingText.sourceCodeLocation = {
                      ...this.pendingText.sourceCodeLocation,
                      endLine,
                      endCol,
                      endOffset,
                  };
              }
          }
          // Flush right away when the preprocessor reports that it will drop the
          // parsed chunk. NOTE(review): presumably so buffered text does not
          // outlive the input it came from - confirm against the preprocessor.
          if (this.tokenizer.preprocessor.willDropParsedChunk()) {
              this._emitPendingText();
          }
      }

      /**
       * Whitespace tokens are treated as ordinary text.
       *
       * @internal
       */
      onWhitespaceCharacter(token) {
          this.onCharacter(token);
      }

      /**
       * Null-character tokens are treated as ordinary text.
       *
       * @internal
       */
      onNullCharacter(token) {
          this.onCharacter(token);
      }

      /**
       * Flushes any pending text and marks the parser as stopped.
       *
       * @internal
       */
      onEof() {
          this._emitPendingText();
          this.stopped = true;
      }

      /**
       * Flushes pending text, then reports the start tag via `startTag`.
       *
       * @internal
       */
      onStartTag(token) {
          this._emitPendingText();
          const startTag = {
              tagName: token.tagName,
              attrs: token.attrs,
              selfClosing: token.selfClosing,
              sourceCodeLocation: token.location,
          };
          this.emitIfListenerExists('startTag', startTag);
      }

      /**
       * Flushes pending text, then reports the end tag via `endTag`.
       *
       * @internal
       */
      onEndTag(token) {
          this._emitPendingText();
          const endTag = {
              tagName: token.tagName,
              sourceCodeLocation: token.location,
          };
          this.emitIfListenerExists('endTag', endTag);
      }

      /**
       * Flushes pending text, then reports the doctype via `doctype`.
       *
       * @internal
       */
      onDoctype(token) {
          this._emitPendingText();
          const doctype = {
              name: token.name,
              publicId: token.publicId,
              systemId: token.systemId,
              sourceCodeLocation: token.location,
          };
          this.emitIfListenerExists('doctype', doctype);
      }

      /**
       * Flushes pending text, then reports the comment via `comment`.
       * The tokenizer's `data` field is exposed as `text`.
       *
       * @internal
       */
      onComment(token) {
          this._emitPendingText();
          const comment = {
              text: token.data,
              sourceCodeLocation: token.location,
          };
          this.emitIfListenerExists('comment', comment);
      }

      /**
       * Emits `token` under `eventName` only when someone is listening.
       *
       * @param eventName Name of the event to emit.
       * @param token Payload passed to the listeners.
       * @returns `true` if the token was emitted, `false` if there were no listeners.
       */
      emitIfListenerExists(eventName, token) {
          if (this.listenerCount(eventName) === 0) {
              return false;
          }
          this._emitToken(eventName, token);
          return true;
      }

      /**
       * Emits `token` under `eventName`. Kept as a separate method so the
       * emission step can be overridden independently of the listener check.
       */
      _emitToken(eventName, token) {
          this.emit(eventName, token);
      }

      /**
       * Emits the accumulated text (if any) as a `text` event and clears it.
       * The buffer is cleared even when no `text` listener is registered.
       */
      _emitPendingText() {
          if (this.pendingText !== null) {
              this.emitIfListenerExists('text', this.pendingText);
              this.pendingText = null;
          }
      }
  }
  192. //# sourceMappingURL=index.js.map