通用抽取库

概述

本文列举了2个目前针对资讯类效果较好的开源智能抽取库:Newspaper 和 GeneralNewsExtractor(GNE)。

Newspaper 的下载功能有限,但提供丰富的抽取字段。

GNE采用前端渲染的HTML进行字段抽取,且不提供下载HTML的功能。

Newspaper

GitHub:https://github.com/codelucas/newspaper

文档:https://newspaper.readthedocs.io/en/latest/

安装:python3需要 pip install newspaper3k

from newspaper import Article

if __name__ == '__main__':
    url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    html = '''
    获取到的网页源码
    '''
    # An Article must be constructed with its URL — newspaper uses it to
    # normalise relative links inside the page.
    article = Article(url)
    # The language can be forced, e.g. Article(url, language='zh');
    # see the docs for the full language list.

    # Feed the HTML we already fetched instead of letting newspaper download it.
    article.download(input_html=html)
    # Run the parsers that populate the extraction fields.
    article.parse()

    # Dump the main fields; Article.__init__ lists every available attribute.
    divider = '-' * 50
    for field in (article.title, article.top_image, article.publish_date):
        print(field)
        print(divider)
    print(article.text)

GeneralNewsExtractor

GitHub:https://github.com/GeneralNewsExtractor/GeneralNewsExtractor

文档:https://generalnewsextractor.readthedocs.io/zh_CN/latest/

安装:pip install gne

下面代码是一个基于GNE和Selenium的字段抽取服务

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import json
import time

import requests
from flask import Flask, request
from gne import GeneralNewsExtractor
from selenium import webdriver
from selenium.common.exceptions import TimeoutException

app = Flask(__name__)


@app.route("/test", methods=['POST'])
def word():
    url_link = request.form['url_link']
    if '' != url_link:
        # 判断url开头是否是http://或者https://开头
        if not (url_link.startswith("http://", 0, 7) or url_link.startswith("https://", 0, 8)):
            if url_link.startswith("//", 0, 3):
                url_link = "http:" + url_link
            else:
                url_link = "http://" + url_link
        html_str = extract_by_selenium(url_link)
        if html_str != '':
            news_extraction_data = json.dumps(news_extraction(html_str), ensure_ascii=False)  # 字典转码
            return {
                'flag': '成功',
                'msg': '成功了',
                'data': news_extraction_data
            }
        return {
            'flag': '失败',
            'msg': '网站访问有点慢~',
            'data': ''
        }
    else:
        return {
            'flag': '失败',
            'msg': '提取失败,请反馈该链接帮助我们排查,谢谢!',
            'data': ''
        }


def extract_by_selenium(url_link):
    """Fetch *url_link* in Chrome and return the rendered HTML, or '' on failure.

    A plain ``requests.get`` probes the URL first (5 s timeout) — only a 200
    response is worth paying the browser start-up cost for. After the page
    loads, an injected script annotates every element with visibility and
    bounding-box attributes that GNE consumes when ``use_visiable_info=True``.
    """
    try:
        # Cheap reachability probe before spinning up a browser.
        response = requests.get(url=url_link, timeout=5)
        print(response.status_code)
        if response.status_code != 200:
            return ''
        chrome_options = webdriver.ChromeOptions()
        # Hide the "Chrome is being controlled..." banner and automation switches.
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
        chrome_options.add_argument(
            'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
        # Removes the navigator.webdriver automation fingerprint.
        chrome_options.add_argument("disable-blink-features=AutomationControlled")
        wd = webdriver.Chrome(chrome_options=chrome_options,
                              executable_path=r'../driver/chromedriver-101.0.4951.41.exe')
        try:
            wd.set_page_load_timeout(30)  # page-load timeout
            wd.set_script_timeout(30)  # JS execution timeout
            try:
                wd.get(url_link)
                # Crude wait for client-side rendering to settle.
                time.sleep(5)
                # Tag each element with is_visiable/coordinate attributes and
                # store page geometry in a <meta> node for GNE to read.
                wd.execute_script('''
                    function insert_visiability_info() {
                    function get_body() {
                        var body = document.getElementsByTagName('body')[0]
                        return body
                    }
                
                    function insert_info(element) {
                        is_visiable = element.offsetParent !== null
                        element.setAttribute('is_visiable', is_visiable)
                        if (is_visiable) {
                            react = element.getBoundingClientRect()
                            coordinate = JSON.stringify(react)
                            element.setAttribute('coordinate', coordinate)
                        }
                    }
                
                    function iter_node(node) {
                        children = node.children
                        insert_info(node)
                        if (children.length !== 0) {
                            for(const element of children) {
                                iter_node(element)
                            }
                        }
                    }
                
                    function sizes() {
                        let contentWidth = [...document.body.children].reduce(
                          (a, el) => Math.max(a, el.getBoundingClientRect().right), 0)
                          - document.body.getBoundingClientRect().x;
                
                        return {
                          windowWidth:  document.documentElement.clientWidth,
                          windowHeight: document.documentElement.clientHeight,
                          pageWidth:    Math.min(document.body.scrollWidth, contentWidth),
                          pageHeight:   document.body.scrollHeight,
                          screenWidth:  window.screen.width,
                          screenHeight: window.screen.height,
                          pageX:        document.body.getBoundingClientRect().x,
                          pageY:        document.body.getBoundingClientRect().y,
                          screenX:     -window.screenX,
                          screenY:     -window.screenY - (window.outerHeight-window.innerHeight),
                        }
                    }
                
                    function insert_page_info() {
                        page_info = sizes()
                        node = document.createElement('meta')
                        node.setAttribute('name', 'page_visiability_info')
                        node.setAttribute('page_info', JSON.stringify(page_info))
                        document.getElementsByTagName('head')[0].appendChild(node)
                    }
                
                    insert_page_info()
                    body = get_body()
                    iter_node(body)
                    }
                    insert_visiability_info()
                    
                    ''')
                # .parent of a WebElement is the driver, so this is the full
                # page source including the attributes injected above.
                html = wd.find_element_by_tag_name('html').parent.page_source
                return html
            except TimeoutException:
                # Loading took too long: stop any in-flight work via JS,
                # then give up on this URL.
                wd.execute_script('window.stop()')
                return ''
        finally:
            # Always release the browser, even on unexpected errors — the
            # original code leaked the Chrome process in that case.
            # quit() also closes every window, so a separate close() is redundant.
            wd.quit()
    except Exception as e:
        print("异常:", e)
        return ''


def news_extraction(html):
    """Run GNE over rendered HTML and return its extraction result dict."""
    # XPath expressions for boilerplate regions GNE should ignore.
    noise_nodes = [
        '//div[@class="right-sidebar"]',
        '//div[@class="comment-list"]',
        '//*[@style="display:none"]',
        '//div[@class="statement"]',
    ]
    extractor = GeneralNewsExtractor()
    # use_visiable_info consumes the visibility attributes injected by the
    # selenium script ("visiable" is GNE's own spelling of the keyword).
    return extractor.extract(html,
                             noise_node_list=noise_nodes,
                             use_visiable_info=True)


if __name__ == '__main__':
    # Flask dev server on the default 127.0.0.1:5000; swap in
    # app.run(host='0.0.0.0', port=80) to expose it externally.
    app.run()

通用抽取库
https://元气码农少女酱.我爱你/c0c979fb059e/
作者
元气码农少女酱
发布于
2023年5月2日
许可协议