import { Transform } from 'node:stream';
import { DevNullStream } from './dev-null-stream.js';
import { ParserFeedbackSimulator } from './parser-feedback-simulator.js';

/**
 * Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML parser.
 * A [transform stream](https://nodejs.org/api/stream.html#stream_class_stream_transform) (which means you can pipe _through_ it, see example).
 *
 * @example
 *
 * ```js
 * const { SAXParser } = require('parse5-sax-parser');
 * const http = require('http');
 * const fs = require('fs');
 *
 * const file = fs.createWriteStream('/home/google.com.html');
 * const parser = new SAXParser();
 *
 * parser.on('text', ({ text }) => {
 *     // Handle page text content
 *     ...
 * });
 *
 * http.get('http://google.com', res => {
 *     // `SAXParser` is a `Transform` stream, which means you can pipe
 *     // through it. So you can analyze the page content and, e.g., save it
 *     // to a file at the same time:
 *     res.setEncoding('utf8');
 *     res.pipe(parser).pipe(file);
 * });
 * ```
 */
export class SAXParser extends Transform {
    /**
     * @param options Parsing options.
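     *
     * @example
     *
     * ```js
     * // A minimal sketch: passing `sourceCodeLocationInfo: true` makes the
     * // tokens emitted by this parser carry a `sourceCodeLocation` object.
     * const { SAXParser } = require('parse5-sax-parser');
     *
     * const parser = new SAXParser({ sourceCodeLocationInfo: true });
     *
     * parser.on('startTag', ({ tagName, sourceCodeLocation }) => {
     *     console.log(tagName, sourceCodeLocation.startLine, sourceCodeLocation.startCol);
     * });
     *
     * parser.write('<div>Hello</div>');
     * parser.end();
     * ```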
     */
    constructor(options = {}) {
        super({ encoding: 'utf8', decodeStrings: false });
        this.pendingText = null;
        this.lastChunkWritten = false;
        this.stopped = false;
        this.options = {
            sourceCodeLocationInfo: false,
            ...options,
        };
        this.parserFeedbackSimulator = new ParserFeedbackSimulator(this.options, this);
        this.tokenizer = this.parserFeedbackSimulator.tokenizer;
        // NOTE: always pipe the stream to the /dev/null stream to avoid
        // hitting the `highWaterMark` even when there are no consumers.
        // (see: https://github.com/inikulin/parse5/issues/97#issuecomment-171940774)
        this.pipe(new DevNullStream());
    }

    // `Transform` implementation
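    // NOTE: chunks must already be decoded strings (e.g. call
    // `res.setEncoding('utf8')` on an HTTP response before piping it in);
    // buffers are rejected below.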
    _transform(chunk, _encoding, callback) {
        if (typeof chunk !== 'string') {
            throw new TypeError('Parser can work only with string streams.');
        }
        callback(null, this._transformChunk(chunk));
    }

    _final(callback) {
        this.lastChunkWritten = true;
        callback(null, this._transformChunk(''));
    }

    /**
     * Stops parsing. Useful if you want the parser to stop consuming CPU time
     * once you've obtained the desired info from the input stream. Doesn't
     * prevent piping, so data will flow through the parser as usual.
     *
     * @example
     *
     * ```js
     * const { SAXParser } = require('parse5-sax-parser');
     * const http = require('http');
     * const fs = require('fs');
     *
     * const file = fs.createWriteStream('google.com.html');
     * const parser = new SAXParser();
     *
     * parser.on('doctype', ({ name, publicId, systemId }) => {
     *     // Process doctype info and stop parsing
     *     ...
     *     parser.stop();
     * });
     *
     * http.get('http://google.com', res => {
     *     // Despite the fact that `parser.stop()` was called, the whole
     *     // content of the page will still be written to the file:
     *     res.setEncoding('utf8');
     *     res.pipe(parser).pipe(file);
     * });
     * ```
     */
    stop() {
        this.stopped = true;
        this.tokenizer.pause();
    }

    // Internals
    _transformChunk(chunk) {
        if (!this.stopped) {
            this.tokenizer.write(chunk, this.lastChunkWritten);
        }
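        // Pass the chunk through unchanged, so consumers piping through the
        // parser receive the original markup.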
        return chunk;
    }

    /** @internal */
    onCharacter({ chars, location }) {
        if (this.pendingText === null) {
            this.pendingText = { text: chars, sourceCodeLocation: location };
        } else {
            this.pendingText.text += chars;
            if (location && this.pendingText.sourceCodeLocation) {
                const { endLine, endCol, endOffset } = location;
                this.pendingText.sourceCodeLocation = {
                    ...this.pendingText.sourceCodeLocation,
                    endLine,
                    endCol,
                    endOffset,
                };
            }
        }
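
        // NOTE: flush the buffered text before the preprocessor drops the
        // parsed part of its input buffer, so that pending text does not
        // accumulate without bound on large text-only inputs.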
        if (this.tokenizer.preprocessor.willDropParsedChunk()) {
            this._emitPendingText();
        }
    }

    /** @internal */
    onWhitespaceCharacter(token) {
        this.onCharacter(token);
    }

    /** @internal */
    onNullCharacter(token) {
        this.onCharacter(token);
    }

    /** @internal */
    onEof() {
        this._emitPendingText();
        this.stopped = true;
    }

    /** @internal */
    onStartTag(token) {
        this._emitPendingText();
        const startTag = {
            tagName: token.tagName,
            attrs: token.attrs,
            selfClosing: token.selfClosing,
            sourceCodeLocation: token.location,
        };
        this.emitIfListenerExists('startTag', startTag);
    }

    /** @internal */
    onEndTag(token) {
        this._emitPendingText();
        const endTag = {
            tagName: token.tagName,
            sourceCodeLocation: token.location,
        };
        this.emitIfListenerExists('endTag', endTag);
    }

    /** @internal */
    onDoctype(token) {
        this._emitPendingText();
        const doctype = {
            name: token.name,
            publicId: token.publicId,
            systemId: token.systemId,
            sourceCodeLocation: token.location,
        };
        this.emitIfListenerExists('doctype', doctype);
    }

    /** @internal */
    onComment(token) {
        this._emitPendingText();
        const comment = {
            text: token.data,
            sourceCodeLocation: token.location,
        };
        this.emitIfListenerExists('comment', comment);
    }
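
    /**
     * Emits `eventName` only when it has at least one subscriber and reports
     * whether the token was emitted.
     */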
    emitIfListenerExists(eventName, token) {
        if (this.listenerCount(eventName) === 0) {
            return false;
        }
        this._emitToken(eventName, token);
        return true;
    }
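
    // Kept as a separate method so that subclasses (e.g. a rewriting stream)
    // can intercept token emission by overriding it.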
    _emitToken(eventName, token) {
        this.emit(eventName, token);
    }

    _emitPendingText() {
        if (this.pendingText !== null) {
            this.emitIfListenerExists('text', this.pendingText);
            this.pendingText = null;
        }
    }
}