Detailed explanation of format cleaning tool based on XPath selector, pyquery and regular expression

Date: 2021-03-03

1. Use XPath to remove unwanted tag elements and tags with no content

from lxml import etree
 
def xpath_clean(self, text: str, xpath_dict: dict) -> str:
    '''
    Remove unwanted elements from the HTML with XPath.

    :param text: html_content to clean
    :param xpath_dict: extra XPath expressions of targets to remove
    :return: string type html_content
    '''
    # Copy the caller's dict so the update() below does not mutate it.
    remove_by_xpath = dict(xpath_dict) if xpath_dict else dict()

    # Tags that are almost always noise in an article body; removed
    # unconditionally except in extreme cases.
    remove_by_xpath.update({
      '_remove_2': '//iframe',
      '_remove_4': '//button',
      '_remove_5': '//form',
      '_remove_6': '//input',
      '_remove_7': '//select',
      '_remove_8': '//option',
      '_remove_9': '//textarea',
      '_remove_10': '//figure',
      '_remove_11': '//figcaption',
      '_remove_12': '//frame',
      '_remove_13': '//video',
      '_remove_14': '//script',
      '_remove_15': '//style'
    })

    parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
    selector = etree.HTML(text, parser=parser)

    # General deletion pass: drop every node matched by any expression.
    for xpath in remove_by_xpath.values():
      for bad in selector.xpath(xpath):
        parent = bad.getparent()
        if parent is None:
          # Matched the root element; nothing to detach it from.
          continue
        bad_string = etree.tostring(bad, encoding='utf-8',
                      pretty_print=True).decode()
        logger.debug(f"clean article content : {bad_string}")
        parent.remove(bad)

    # Structural tags that are meaningful even with no text content.
    skip_tip = "name()='img' or name()='tr' or " \
          "name()='th' or name()='tbody' or " \
          "name()='thead' or name()='table'"
    # Delete any remaining element with no text, unless it is one of the
    # structural tags above or contains one.
    for p in selector.xpath(f"//*[not({skip_tip})]"):
      # Skip: keeps elements wrapping tables/images, and elements whose
      # string value still has non-whitespace content.
      if p.xpath(f".//*[{skip_tip}]") or \
          bool(re.sub(r'\s', '', p.xpath('string(.)'))):
        continue

      parent = p.getparent()
      if parent is None:
        continue
      bad_p = etree.tostring(p, encoding='utf-8',
                  pretty_print=True).decode()
      logger.debug(f"clean p tag : {bad_p}")
      parent.remove(p)

    return etree.tostring(selector, encoding='utf-8',
               pretty_print=True).decode()

2. Use pyquery to clean up tag attributes, and return the processed HTML source and plain text

#!/usr/bin/env python
# -*-coding:utf-8-*-
 
from pyquery import PyQuery as pq
 
def pyquery_clean(self, text, url, pq_dict) -> object:
    '''
    Strip unwanted tags and normalize attributes with pyquery.

    :param text: html_content to clean
    :param url: page URL, used to absolutize relative image links
    :param pq_dict: extra pyquery selectors of targets to remove
    :return: tuple of (plain text, cleaned html source)
    '''
    # Selectors of elements to delete.
    remove_by_pq = pq_dict if pq_dict else dict()
    # Attributes kept as-is (table layout must survive cleaning).
    attr_white_list = ['rowspan', 'colspan']
    # Attributes that may hold the real image URL (lazy-load variants).
    img_key_list = ['src', 'data-echo', 'data-src', 'data-original']
    # Build the pyquery document.
    dom = pq(text)

    # Remove the configured selectors, logging what is dropped.
    for bad_tag in remove_by_pq.values():
      for bad in dom(bad_tag):
        bad_string = pq(bad).html()
        logger.debug(f"clean article content : {bad_string}")
      dom.remove(bad_tag)

    # Attribute processing on every element.
    for tag in dom('*'):
      # Snapshot the items: the loop body adds and removes attributes,
      # and iterating a live attribute mapping while mutating it is unsafe.
      for key, value in list(tag.attrib.items()):
        # Keep the whitelisted table attributes (rowspan/colspan).
        if key in attr_white_list:
          continue
        # Promote lazy-load image URLs to a normal, absolute src.
        if key in img_key_list:
          img_url = self.absolute_url(url, value)
          pq(tag).remove_attr(key)
          pq(tag).attr('src', img_url)
          pq(tag).attr('alt', '')
        # Blank out alt text.
        elif key == 'alt':
          pq(tag).attr(key, '')
        # Delete every other attribute.
        else:
          pq(tag).remove_attr(key)

    return dom.text(), dom.html()

3. Use regular expressions to clean up whitespace and newline content

#!/usr/bin/env python
# -*-coding:utf-8-*-
 
import re  
 
def regular_clean(self, str1: str, str2: str):
    '''
    Final regex pass over the extracted data.

    :param str1: content (plain text)
    :param str2: html_content
    :return: tuple of (content, html_content) after processing
    '''

    def new_line(text):
      # Normalize <br> variants, strip cosmetic inline tags, then turn
      # headings into paragraphs and re-introduce newlines after </p>.
      text = re.sub(r'<br\s?/?>', '<br>', text)
      text = re.sub(
        r'</?a>|</?em>|</?html>|</?body>|'
        r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
        r'</?strong>|</?blockquote>|</?b>|'
        r'</?span>|</?i>|</?hr>|</?font>',
        '',
        text)
      text = re.sub(r'\n', '', text)
      text = re.sub(r'<h[1-6]>', '<p>', text)
      text = re.sub(r'</h[1-6]>', '</p>', text)
      text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
      return text

    # Normalize whitespace in both strings (the published source was
    # garbled here; reconstructed as two clean_blank calls).
    str1, str2 = self.clean_blank(str1), self.clean_blank(str2)

    str2 = new_line(text=str2)

    return str1, str2

Finally, the complete class that encapsulates each of the methods above

#!/usr/bin/env python
# -*-coding:utf-8-*-
'''
author: szhan
date: 2020-08-17
Summary: clean up html_content and obtain a pure data format
'''
 
import re
from lxml import etree
from pyquery import PyQuery as pq
from urllib.parse import urlsplit, urljoin
 
from loguru import logger
 
 
class CleanArticle:
  '''
  Clean raw HTML into plain article text plus a simplified HTML version.

  Pipeline (see run()):
    1. xpath_clean   - drop noisy / empty tags with lxml + XPath
    2. pyquery_clean - drop configured selectors, normalize attributes
    3. regular_clean - final regex pass (inline tags, newlines)
  '''

  def __init__(
      self,
      text: str,
      url: str = '',
      xpath_dict: dict = None,
      pq_dict: dict = None
  ):
    # Raw html content to clean.
    self.text = text
    # Page URL, used to absolutize relative image links.
    self.url = url
    # Extra removal rules supplied by the caller (never None afterwards).
    self.xpath_dict = xpath_dict or dict()
    self.pq_dict = pq_dict or dict()

  @staticmethod
  def absolute_url(baseurl: str, url: str) -> str:
    '''
    Complete a possibly-relative URL against a base URL.

    :param baseurl: base url providing scheme/netloc
    :param url: target url (absolute or relative)
    :return: complete url
    '''
    # A url that already carries a scheme is returned unchanged.
    target_url = url if urlsplit(url).scheme else urljoin(baseurl, url)
    return target_url

  @staticmethod
  def clean_blank(text):
    '''
    Whitespace normalization.

    :param text: string to normalize
    :return: normalized string
    '''
    # NOTE(review): the first replace target was garbled by a literal
    # newline in the published source; '\r' is assumed here -- confirm
    # against the original code.
    text = text.replace('\r', '').replace('\u3000', '').replace('\t', '').replace('\xa0', '')
    text = re.sub(r'\s{2,}', '', text)
    text = re.sub(r'\n{2,}', '\n', text)
    text = text.strip('\n').strip()
    return text

  def run(self):
    '''
    Execute the full cleaning pipeline.

    :return: tuple of (content, html_content)
    :raises ValueError: if self.text is empty or not a string
    '''
    if (not bool(self.text)) or (not isinstance(self.text, str)):
      raise ValueError('html_content has a bad type value')
    # Step 1: XPath removes comments, iframe, button, form, script,
    # style, video and other noisy tags.
    text = self.xpath_clean(self.text, self.xpath_dict)

    # Step 2: pyquery handles the attribute-level details.
    str1, str2 = self.pyquery_clean(text, self.url, self.pq_dict)

    # Step 3: the final regex pass.
    content, html_content = self.regular_clean(str1, str2)

    return content, html_content

  def xpath_clean(self, text: str, xpath_dict: dict) -> str:
    '''
    Remove unwanted elements from the HTML with XPath.

    :param text: html_content to clean
    :param xpath_dict: extra XPath expressions of targets to remove
    :return: string type html_content
    '''
    # Copy the caller's dict so the update() below does not mutate it.
    remove_by_xpath = dict(xpath_dict) if xpath_dict else dict()

    # Tags that are almost always noise in an article body; removed
    # unconditionally except in extreme cases.
    remove_by_xpath.update({
      '_remove_2': '//iframe',
      '_remove_4': '//button',
      '_remove_5': '//form',
      '_remove_6': '//input',
      '_remove_7': '//select',
      '_remove_8': '//option',
      '_remove_9': '//textarea',
      '_remove_10': '//figure',
      '_remove_11': '//figcaption',
      '_remove_12': '//frame',
      '_remove_13': '//video',
      '_remove_14': '//script',
      '_remove_15': '//style'
    })

    parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
    selector = etree.HTML(text, parser=parser)

    # General deletion pass: drop every node matched by any expression.
    for xpath in remove_by_xpath.values():
      for bad in selector.xpath(xpath):
        parent = bad.getparent()
        if parent is None:
          # Matched the root element; nothing to detach it from.
          continue
        bad_string = etree.tostring(bad, encoding='utf-8',
                      pretty_print=True).decode()
        logger.debug(f"clean article content : {bad_string}")
        parent.remove(bad)

    # Structural tags that are meaningful even with no text content.
    skip_tip = "name()='img' or name()='tr' or " \
          "name()='th' or name()='tbody' or " \
          "name()='thead' or name()='table'"
    # Delete any remaining element with no text, unless it is one of the
    # structural tags above or contains one.
    for p in selector.xpath(f"//*[not({skip_tip})]"):
      # Skip: keeps wrappers of tables/images and elements whose string
      # value still has non-whitespace content.
      if p.xpath(f".//*[{skip_tip}]") or \
          bool(re.sub(r'\s', '', p.xpath('string(.)'))):
        continue

      parent = p.getparent()
      if parent is None:
        continue
      bad_p = etree.tostring(p, encoding='utf-8',
                  pretty_print=True).decode()
      logger.debug(f"clean p tag : {bad_p}")
      parent.remove(p)

    return etree.tostring(selector, encoding='utf-8',
               pretty_print=True).decode()

  def pyquery_clean(self, text, url, pq_dict) -> object:
    '''
    Strip unwanted tags and normalize attributes with pyquery.

    :param text: html_content to clean
    :param url: page URL, used to absolutize relative image links
    :param pq_dict: extra pyquery selectors of targets to remove
    :return: tuple of (plain text, cleaned html source)
    '''
    # Selectors of elements to delete.
    remove_by_pq = pq_dict if pq_dict else dict()
    # Attributes kept as-is (table layout must survive cleaning).
    attr_white_list = ['rowspan', 'colspan']
    # Attributes that may hold the real image URL (lazy-load variants).
    img_key_list = ['src', 'data-echo', 'data-src', 'data-original']
    # Build the pyquery document.
    dom = pq(text)

    # Remove the configured selectors, logging what is dropped.
    for bad_tag in remove_by_pq.values():
      for bad in dom(bad_tag):
        bad_string = pq(bad).html()
        logger.debug(f"clean article content : {bad_string}")
      dom.remove(bad_tag)

    # Attribute processing on every element.
    for tag in dom('*'):
      # Snapshot the items: the loop body adds and removes attributes,
      # and iterating a live attribute mapping while mutating it is unsafe.
      for key, value in list(tag.attrib.items()):
        # Keep the whitelisted table attributes (rowspan/colspan).
        if key in attr_white_list:
          continue
        # Promote lazy-load image URLs to a normal, absolute src.
        if key in img_key_list:
          img_url = self.absolute_url(url, value)
          pq(tag).remove_attr(key)
          pq(tag).attr('src', img_url)
          pq(tag).attr('alt', '')
        # Blank out alt text.
        elif key == 'alt':
          pq(tag).attr(key, '')
        # Delete every other attribute.
        else:
          pq(tag).remove_attr(key)

    return dom.text(), dom.html()

  def regular_clean(self, str1: str, str2: str):
    '''
    Final regex pass over the extracted data.

    :param str1: content (plain text)
    :param str2: html_content
    :return: tuple of (content, html_content) after processing
    '''

    def new_line(text):
      # Normalize <br> variants, strip cosmetic inline tags, then turn
      # headings into paragraphs and re-introduce newlines after </p>.
      text = re.sub(r'<br\s?/?>', '<br>', text)
      text = re.sub(
        r'</?a>|</?em>|</?html>|</?body>|'
        r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
        r'</?strong>|</?blockquote>|</?b>|'
        r'</?span>|</?i>|</?hr>|</?font>',
        '',
        text)
      text = re.sub(r'\n', '', text)
      text = re.sub(r'<h[1-6]>', '<p>', text)
      text = re.sub(r'</h[1-6]>', '</p>', text)
      text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
      return text

    # Normalize whitespace in both strings (the published source was
    # garbled here; reconstructed as two clean_blank calls).
    str1, str2 = self.clean_blank(str1), self.clean_blank(str2)

    str2 = new_line(text=str2)

    return str1, str2
 
if __name__ == '__main__':
  # Read the whole file at once instead of concatenating line by line
  # (repeated str += is the quadratic string-building anti-pattern).
  with open('html_content.html', 'r', encoding='utf-8') as f:
    html = f.read()
  ca = CleanArticle(text=html)
  _, html_content = ca.run()
  print(html_content)

Summary

This concludes the detailed explanation of the format cleaning tool based on XPath selectors, pyquery and regular expressions. For more information about format cleaning with pyquery and regular expressions, please search previous articles on this site or continue to browse the related articles below. We hope you will continue to support us!