home > webfront > SGML > html >

手搓一个HTML解释器（HTMLParser ）

Author：zhoulujun Date：2024-07-15 Hits：

使用 DOM 解析 HTML，可以使用原生的 DOMParser；流行的 HTML 解析器库包括 Cheerio、jsdom、htmlparser2 等。也可以使用 innerHTML、outerHTML、insertAdjacentHTML 将 HTML 字符串插入到已存在的 DOM 元素中。但如果要手工实现呢？

js如何解释html文本？

使用 DOM 解析 HTML，可以使用原生的 DOMParser，具体参看 https://developer.mozilla.org/zh-CN/docs/Web/API/DOMParser

// Download a page with XMLHttpRequest, parse the response body into a
// document with DOMParser, and log the text of its first <title> element.
const xhr = new XMLHttpRequest();
xhr.open('GET', 'https://www.example.com');
xhr.onload = () => {
  // Any status other than 200 is reported as a load failure.
  if (xhr.status !== 200) {
    console.error('Error loading HTML document');
    return;
  }

  const parser = new DOMParser();
  const parsedDocument = parser.parseFromString(xhr.responseText, 'text/html');
  const titleElements = parsedDocument.getElementsByTagName('title');
  console.log(titleElements[0].textContent);
};
xhr.send();

实际上，我们一般使用第三方库

流行的 HTML 解析器库包括：

Cheerio：https://github.com/cheeriojs
jsdom：https://github.com/jsdom/jsdom
htmlparser2：https://github.com/fb55/htmlparser2

当然，我们也可以使用 innerHTML、outerHTML、insertAdjacentHTML 将 HTML 字符串插入到已存在的 DOM 元素中。

// Three DOM APIs that accept an HTML string, shown on one looked-up element.
const element = document.getElementById('someElement');
// Assigns the markup string to the element's innerHTML.
element.innerHTML = '<p>Hello, World!</p>';
// NOTE(review): per the DOM specification, assigning outerHTML replaces the
// node inside its parent while `element` keeps referring to the old, now
// detached node — confirm the three statements are meant as independent
// examples rather than as one sequence.
element.outerHTML = '<div class="newClass">New content</div>';
// Inserts the markup at the 'beforeend' position relative to the element.
element.insertAdjacentHTML('beforeend', '<p>New paragraph</p>');

这些方式本身都无法防止 XSS 攻击。为了防止 XSS 攻击，你需要在将解析后的 DOM 插入到页面之前对其进行净化。这通常涉及到以下步骤：

实体编码：将任何来自用户输入的文本转换为 HTML 实体，以防止它们被解析为 HTML 或 JavaScript。
属性净化：移除或限制 HTML 标签和属性，特别是那些可能引起安全问题的，比如 on* 事件处理器或 src 属性。
脚本移除：确保 <script> 和 <iframe> 等标签不会被执行。

有许多库可以帮助你完成这些任务，例如：

DOMPurify：这是一个专门设计用于清理 HTML，防止 XSS 攻击的库。
JS Html Sanitizer：另一个用于防止 XSS 的 HTML 清洗库。

如果要我们手工实现一个 HTML parser，该怎么做呢？

手搓HTML解析器

实现 html parser 主要分为词法分析和语法分析两步。

词法分析

词法分析需要把每一种类型的 token 识别出来，具体的类型有：

开始标签，如 <div>
结束标签，如 </div>
注释标签，如 <!-- aaa -->
doctype 标签，如 <!doctype html>
text，如 aaa

这是最外层的 token，开始标签内部还要分出属性，如 id="aaa" 这种。

也就是有这几种情况：

写一个 HTML Parser

第一层判断是否包含 <，如果不包含则是 text，如果包含则再判断是哪一种，如果是开始标签，还要对其内容再取属性，直到遇到 > 就重新判断。

语法分析

语法分析就是对上面分出的 token 进行组装，生成 AST。

html 的 ast 的组装主要是考虑父子关系，记录当前的 parent，然后 text、children 都设置到当前 parent 上。

HTML Parser 语法分析

实现分析

首先我想到的是正则表达式

// Token patterns for a regex-based HTML tokenizer. Every pattern is anchored
// with ^ so it only matches at the start of the not-yet-consumed input.

// Start tag such as <div id="a" class="x y">; group 1 is the tag name.
// Attribute values may be double-quoted or single-quoted (and may then
// contain spaces or '>'), or unquoted.
const startTagReg = /^<([a-zA-Z0-9\-]+)(?:([ ]+[a-zA-Z0-9\-]+=(?:"[^"]*"|'[^']*'|[^> ]+)))*>/;
// End tag such as </div>; group 1 is the tag name.
const endTagReg = /^<\/([a-zA-Z0-9\-]+)>/;
// Comment: lazily consume everything up to the first "-->". A negated
// character class such as [^(-->)] only excludes the single characters
// '(', '-', '>' and ')', so it cannot express "anything but the sequence -->".
const commentReg = /^<!--[\s\S]*?-->/;
// Doctype, case-insensitive so that <!DOCTYPE html> is recognised as well.
const docTypeReg = /^<!doctype [^>]+>/i;
// Exactly one attribute including its leading spaces; group 1 is the raw
// name=value text. The lookahead keeps the match aligned with startTagReg by
// requiring the value to end at a space, at '>' or at the end of the input.
const attributeReg = /^(?:[ ]+([a-zA-Z0-9\-]+=(?:"[^"]*"|'[^']*'|[^> ]+))(?=[ >]|$))/;

实现：https://github.com/QuarkGluonPlasma/tiny-browser/blob/master/src/htmlParser.js

// Token patterns used by parse(). Every pattern is anchored with ^ because
// parse() always matches against the start of the not-yet-consumed input.

// Start tag such as <div id="a" class="x y">; group 1 is the tag name.
// Attribute values may be double-quoted or single-quoted (and may then
// contain spaces or '>'), or unquoted.
const startTagReg = /^<([a-zA-Z0-9\-]+)(?:([ ]+[a-zA-Z0-9\-]+=(?:"[^"]*"|'[^']*'|[^> ]+)))*>/;
// Exactly one attribute including its leading spaces; group 1 is the raw
// name=value text. The lookahead keeps the match aligned with startTagReg by
// requiring the value to end at a space, at '>' or at the end of the input.
const attributeReg = /^(?:[ ]+([a-zA-Z0-9\-]+=(?:"[^"]*"|'[^']*'|[^> ]+))(?=[ >]|$))/;
// End tag such as </div>; group 1 is the tag name.
const endTagReg = /^<\/([a-zA-Z0-9\-]+)>/;
// Comment: lazily consume everything up to the first "-->". A negated
// character class such as [^(-->)] only excludes single characters, so it
// cannot express "anything but the sequence -->".
const commentReg = /^<!--[\s\S]*?-->/;
// Doctype, case-insensitive so that <!DOCTYPE html> is recognised as well.
const docTypeReg = /^<!doctype [^>]+>/i;

/**
 * Tokenizes an HTML string and reports each token to the matching handler.
 *
 * Tokens are plain objects of the form `{ type, value }`:
 *   - onComment:   type 'comment',   value is the whole comment text
 *   - onDoctype:   type 'docType',   value is the whole doctype text
 *   - onEndTag:    type 'tagEnd',    value is the tag name
 *   - onStartTag:  type 'tagStart',  value is the tag name
 *   - onAttribute: type 'attribute', value is the raw name=value text
 *   - onText:      type 'text',      value is the text run
 *
 * Anything beginning with '<' that matches none of the token patterns
 * (for example a lone '<', or a tag with a valueless attribute) is
 * reported as text so the loop always makes progress.
 *
 * @param {string} html - Markup to tokenize.
 * @param {Object} options - Handler callbacks; any of them may be omitted.
 * @returns {undefined}
 */
function parse(html, options) {
  const handlers = options || {};

  // Drops the first `num` characters of the remaining input.
  function advance(num) {
    html = html.slice(num);
  }

  // Calls the named handler, if present, as a method of the handlers object.
  function emit(handlerName, token) {
    if (typeof handlers[handlerName] === 'function') {
      handlers[handlerName](token);
    }
  }

  while (html) {
    if (html.startsWith('<')) {
      const commentMatch = html.match(commentReg);
      if (commentMatch) {
        emit('onComment', {
          type: 'comment',
          value: commentMatch[0]
        });
        advance(commentMatch[0].length);
        continue;
      }

      const docTypeMatch = html.match(docTypeReg);
      if (docTypeMatch) {
        emit('onDoctype', {
          type: 'docType',
          value: docTypeMatch[0]
        });
        advance(docTypeMatch[0].length);
        continue;
      }

      const endTagMatch = html.match(endTagReg);
      if (endTagMatch) {
        emit('onEndTag', {
          type: 'tagEnd',
          value: endTagMatch[1]
        });
        advance(endTagMatch[0].length);
        continue;
      }

      const startTagMatch = html.match(startTagReg);
      if (startTagMatch) {
        emit('onStartTag', {
          type: 'tagStart',
          value: startTagMatch[1]
        });

        // Walk the attributes inside the matched tag text only (between the
        // tag name and the closing '>'), then skip the whole tag at once so
        // the input position never depends on how the attributes were split.
        let attributeSource = startTagMatch[0].slice(startTagMatch[1].length + 1, -1);
        let attributeMatch = attributeSource.match(attributeReg);
        while (attributeMatch) {
          emit('onAttribute', {
            type: 'attribute',
            value: attributeMatch[1]
          });
          attributeSource = attributeSource.slice(attributeMatch[0].length);
          attributeMatch = attributeSource.match(attributeReg);
        }
        advance(startTagMatch[0].length);
        continue;
      }
    }

    // Text run: everything up to the next '<'. Searching from index 1 lets an
    // unmatched leading '<' be consumed as text instead of looping forever.
    const nextTagIndex = html.indexOf('<', 1);
    const textEndIndex = nextTagIndex === -1 ? html.length : nextTagIndex;
    emit('onText', {
      type: 'text',
      value: html.slice(0, textEndIndex)
    });
    advance(textEndIndex);
  }
}

module.exports = function htmlParser(str) {
  const ast = {
    children: []
  };
  let curParent = ast;
  let prevParent = null;
  const domTree = parse(str,{
    onComment(node) {
    },
    onStartTag(token) {
      const tag = {
        tagName: token.value,
        attributes: [],
        text: '',
        children: []
      };
      curParent.children.push(tag);
      prevParent = curParent;
      curParent = tag;
    },
    onAttribute(token) {
      const [ name, value ] = token.value.split('=');
      curParent.attributes.push({
        name,
        value: value.replace(/^['"]/, '').replace(/['"]$/, '')
      });
    },
    onEndTag(token) {
      curParent = prevParent;
    },
    onDoctype(token) {
    },
    onText(token) {
      curParent.text = token.value;
    }
  });
  return ast.children[0];
}

如果不用正则表达式呢？

/** Generic tree node: a `type` label plus optional `content` (the text of a text node). */
class Node {
  constructor(type, content) {
    this.type = type;
    this.content = content;
  }
}

/** Element node (`type` is 'element') with a tag name, an attribute list and child nodes. */
class ElementNode extends Node {
  constructor(tag, attributes, children) {
    super('element');
    this.tag = tag;
    this.attributes = attributes;
    this.children = children;
  }
}

// HTML void elements: they never have content or an end tag, so the
// tokenizer closes them immediately.
const VOID_ELEMENTS = new Set([
  'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
  'input', 'link', 'meta', 'source', 'track', 'wbr',
]);
// Characters accepted in a tag name.
const TAG_NAME_CHAR = /[a-zA-Z0-9\-]/;
// Characters that terminate an attribute name.
const ATTR_NAME_END = /[\s=\/>]/;
// Characters that terminate an unquoted attribute value.
const UNQUOTED_VALUE_END = /[\s>]/;
const WHITESPACE = /\s/;

/**
 * Hand-written (regex-free at the structural level) HTML parser.
 *
 * parse() first tokenizes the input into `this.tokens` — objects of the form
 * `{ type: 'start_tag' | 'end_tag', tag, attributes }` or
 * `{ type: 'text', content }` — and then assembles them into a tree of
 * ElementNode / Node instances.
 */
class HTMLParser {
  constructor() {
    this.pos = 0;
    this.tokens = [];
  }

  /**
   * Parses an HTML string into a node tree.
   *
   * @param {string} html - Markup to parse.
   * @returns {ElementNode|null} The last top-level element of the input, or
   *   null when the input contains no element.
   */
  parse(html) {
    this.pos = 0;
    this.tokens = [];

    while (this.pos < html.length) {
      if (html[this.pos] === '<') {
        this.parseTag(html);
      } else {
        this.parseText(html);
      }
    }

    return this.buildDOMTree();
  }

  /**
   * Consumes one construct starting at the '<' under `this.pos`.
   *
   * Comments and `<!...>` / `<?...>` declarations (doctype etc.) are skipped
   * without producing a token. A '<' that is not followed by a tag name is
   * emitted as text. Void elements and tags written as `<x ... />` produce a
   * start_tag token immediately followed by a matching end_tag token, so the
   * tree builder never leaves them open.
   *
   * @param {string} html - The full input string.
   */
  parseTag(html) {
    const start = this.pos;

    if (html.startsWith('<!--', start)) {
      const commentEnd = html.indexOf('-->', start + 4);
      this.pos = commentEnd === -1 ? html.length : commentEnd + 3;
      return;
    }

    if (html[start + 1] === '!' || html[start + 1] === '?') {
      const declarationEnd = html.indexOf('>', start);
      this.pos = declarationEnd === -1 ? html.length : declarationEnd + 1;
      return;
    }

    this.pos++;  // Skip '<'

    let tagType = 'start_tag';
    if (html[this.pos] === '/') {
      this.pos++;  // Skip '/'
      tagType = 'end_tag';
    }

    const nameStart = this.pos;
    while (this.pos < html.length && TAG_NAME_CHAR.test(html[this.pos])) {
      this.pos++;
    }
    const tagName = html.substring(nameStart, this.pos);

    if (tagName === '') {
      // Not a tag after all (e.g. "1 < 2"): treat the '<' and whatever
      // follows up to the next '<' as literal text.
      while (this.pos < html.length && html[this.pos] !== '<') {
        this.pos++;
      }
      this.tokens.push({
        type: 'text',
        content: html.substring(start, this.pos),
      });
      return;
    }

    const attributes = [];
    let selfClosing = false;
    while (this.pos < html.length && html[this.pos] !== '>') {
      if (html[this.pos] === '/') {
        // Only a '/' directly before '>' marks the tag as self-closing.
        selfClosing = html[this.pos + 1] === '>';
        this.pos++;
        continue;
      }
      const { attrName, attrValue } = this.parseAttribute(html);
      if (attrName) {
        attributes.push({ name: attrName, value: attrValue });
      }
    }

    if (this.pos < html.length) {
      this.pos++;  // Skip '>'
    }

    const isVoid = VOID_ELEMENTS.has(tagName.toLowerCase());
    if (tagType === 'end_tag' && isVoid) {
      // A void element was already closed when its start tag was read.
      return;
    }

    this.tokens.push({
      type: tagType,
      tag: tagName,
      attributes: attributes,
    });

    if (tagType === 'start_tag' && (selfClosing || isVoid)) {
      this.tokens.push({
        type: 'end_tag',
        tag: tagName,
        attributes: [],
      });
    }
  }

  /**
   * Reads one attribute at `this.pos`, skipping leading whitespace.
   *
   * Supports `name="value"`, `name='value'`, unquoted `name=value` and
   * valueless (boolean) attributes, for which the value is ''. Never reads
   * past the '>' that ends the tag unless it is inside a quoted value.
   *
   * @param {string} html - The full input string.
   * @returns {{attrName: string, attrValue: string}} The attribute;
   *   `attrName` is '' when no attribute name was found.
   */
  parseAttribute(html) {
    let attrName = '';
    let attrValue = '';

    while (this.pos < html.length && WHITESPACE.test(html[this.pos])) {
      this.pos++;
    }

    const nameStart = this.pos;
    while (this.pos < html.length && !ATTR_NAME_END.test(html[this.pos])) {
      this.pos++;
    }
    attrName = html.substring(nameStart, this.pos);

    // Look past optional whitespace for '='. Without one this is a boolean
    // attribute (or the end of the tag) and nothing further is consumed.
    let lookahead = this.pos;
    while (lookahead < html.length && WHITESPACE.test(html[lookahead])) {
      lookahead++;
    }
    if (html[lookahead] !== '=') {
      return { attrName, attrValue };
    }

    this.pos = lookahead + 1;  // Skip '='
    while (this.pos < html.length && WHITESPACE.test(html[this.pos])) {
      this.pos++;
    }

    const quoteChar = html[this.pos];
    if (quoteChar === '"' || quoteChar === "'") {
      this.pos++;  // Skip opening quote
      const valueStart = this.pos;
      while (this.pos < html.length && html[this.pos] !== quoteChar) {
        this.pos++;
      }
      attrValue = html.substring(valueStart, this.pos);
      if (this.pos < html.length) {
        this.pos++;  // Skip closing quote
      }
    } else {
      const valueStart = this.pos;
      while (this.pos < html.length && !UNQUOTED_VALUE_END.test(html[this.pos])) {
        this.pos++;
      }
      attrValue = html.substring(valueStart, this.pos);
    }

    return { attrName, attrValue };
  }

  /**
   * Consumes text up to (not including) the next '<' and records it as a
   * text token.
   *
   * @param {string} html - The full input string.
   */
  parseText(html) {
    const start = this.pos;
    while (this.pos < html.length && html[this.pos] !== '<') {
      this.pos++;
    }
    this.tokens.push({
      type: 'text',
      content: html.substring(start, this.pos),
    });
  }

  /**
   * Assembles `this.tokens` into a tree using a stack of open elements.
   *
   * Text outside of any element is dropped. Every end tag closes the
   * innermost open element regardless of its name. When the input has
   * several top-level elements, the last one is returned.
   *
   * @returns {ElementNode|null} The root element, or null if there is none.
   */
  buildDOMTree() {
    const stack = [];
    let rootNode = null;

    for (const token of this.tokens) {
      if (token.type === 'start_tag') {
        const elementNode = new ElementNode(token.tag, token.attributes, []);
        if (stack.length > 0) {
          stack[stack.length - 1].children.push(elementNode);
        } else {
          rootNode = elementNode;
        }
        stack.push(elementNode);
      } else if (token.type === 'end_tag') {
        stack.pop();
      } else if (token.type === 'text') {
        if (stack.length > 0) {
          stack[stack.length - 1].children.push(new Node('text', token.content));
        }
      }
    }

    return rootNode;
  }
}

参考文章：

人问我能不能写一个 HTML Parser？ https://cloud.tencent.com/developer/article/1842858

转载本站文章《手搓一个HTML解释器（HTMLParser ）》,
请注明出处：https://www.zhoulujun.cn/html/webfront/SGML/htmlBase/2024_0715_9170.html

相关热词搜索： HTMLParser html解释器手写 html标签解析

上一篇：Web Components从技术解析到生态应用个人心得指北

下一篇：最后一页