import json
import time
import requests
from flask import Flask, request
from gne import GeneralNewsExtractor
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
# Flask application object; the route handler in this module is registered on it.
app = Flask(__name__)
def _normalize_url(url_link):
    """Return *url_link* with an ``http`` scheme prepended when it has none."""
    # URL schemes are case-insensitive, so compare against a lowered copy;
    # otherwise "HTTP://host" would be turned into "http://HTTP://host".
    if url_link.lower().startswith(("http://", "https://")):
        return url_link
    if url_link.startswith("//"):
        # Protocol-relative link: only the scheme name itself is missing.
        return "http:" + url_link
    return "http://" + url_link


@app.route("/test", methods=['POST'])
def word():
    """Extract the news content of the page named by the ``url_link`` form field.

    The URL is normalised to carry an ``http``/``https`` scheme, rendered
    through ``extract_by_selenium`` and the resulting HTML is passed to
    ``news_extraction``.

    Returns:
        A dict with the keys ``flag``, ``msg`` and ``data``. On success
        ``data`` is the extraction result serialised as a JSON string; on
        failure it is an empty string.
    """
    # A missing field is handled like an empty one so the client always gets
    # the same failure payload; surrounding whitespace would otherwise end up
    # inside the URL once a scheme is prepended.
    url_link = request.form.get('url_link', '').strip()
    if not url_link:
        return {
            'flag': '失败',
            'msg': '提取失败,请反馈该链接帮助我们排查,谢谢!',
            'data': ''
        }

    # NOTE(security): the URL is client-supplied and fetched from this server,
    # so internal hosts are reachable through this endpoint (SSRF). Restrict
    # the allowed targets before exposing this to untrusted callers.
    html_str = extract_by_selenium(_normalize_url(url_link))
    if not html_str:
        # extract_by_selenium returns '' for every failure mode it handles.
        return {
            'flag': '失败',
            'msg': '网站访问有点慢~',
            'data': ''
        }

    # ensure_ascii=False keeps non-ASCII text readable in the serialised result.
    news_extraction_data = json.dumps(news_extraction(html_str), ensure_ascii=False)
    return {
        'flag': '成功',
        'msg': '成功了',
        'data': news_extraction_data
    }
# JavaScript run inside the loaded page. It tags every element under <body>
# with an ``is_visiable`` attribute (``offsetParent !== null``), adds a
# ``coordinate`` attribute holding the JSON of the bounding rectangle to the
# visible ones, and appends a ``<meta name="page_visiability_info">`` element
# carrying window/page/screen sizes to <head>.
# NOTE(review): the script assigns to undeclared variables, which become
# globals of the page; the text is kept exactly as it was.
_VISIBILITY_JS = '''
function insert_visiability_info() {
function get_body() {
var body = document.getElementsByTagName('body')[0]
return body
}
function insert_info(element) {
is_visiable = element.offsetParent !== null
element.setAttribute('is_visiable', is_visiable)
if (is_visiable) {
react = element.getBoundingClientRect()
coordinate = JSON.stringify(react)
element.setAttribute('coordinate', coordinate)
}
}
function iter_node(node) {
children = node.children
insert_info(node)
if (children.length !== 0) {
for(const element of children) {
iter_node(element)
}
}
}
function sizes() {
let contentWidth = [...document.body.children].reduce(
(a, el) => Math.max(a, el.getBoundingClientRect().right), 0)
- document.body.getBoundingClientRect().x;
return {
windowWidth: document.documentElement.clientWidth,
windowHeight: document.documentElement.clientHeight,
pageWidth: Math.min(document.body.scrollWidth, contentWidth),
pageHeight: document.body.scrollHeight,
screenWidth: window.screen.width,
screenHeight: window.screen.height,
pageX: document.body.getBoundingClientRect().x,
pageY: document.body.getBoundingClientRect().y,
screenX: -window.screenX,
screenY: -window.screenY - (window.outerHeight-window.innerHeight),
}
}
function insert_page_info() {
page_info = sizes()
node = document.createElement('meta')
node.setAttribute('name', 'page_visiability_info')
node.setAttribute('page_info', JSON.stringify(page_info))
document.getElementsByTagName('head')[0].appendChild(node)
}
insert_page_info()
body = get_body()
iter_node(body)
}
insert_visiability_info()
'''


def extract_by_selenium(url_link):
    """Render *url_link* in Chrome and return the visibility-annotated HTML.

    The URL is first probed with a plain HTTP GET; the browser is started
    only when that probe answers with status 200. After the page has loaded,
    ``_VISIBILITY_JS`` is executed in it and the resulting page source is
    returned.

    Args:
        url_link: Absolute URL of the page to render.

    Returns:
        The page source as a string, or ``''`` when the probe does not return
        200, the page load or script times out, or any other error occurs.
    """
    try:
        response = requests.get(url=url_link, timeout=5)
        print(response.status_code)
        if response.status_code != 200:
            return ''

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
        chrome_options.add_argument(
            'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
        chrome_options.add_argument("disable-blink-features=AutomationControlled")
        wd = webdriver.Chrome(chrome_options=chrome_options,
                              executable_path=r'../driver/chromedriver-101.0.4951.41.exe')
        try:
            wd.set_page_load_timeout(30)
            wd.set_script_timeout(30)
            wd.get(url_link)
            # Fixed pause after load; presumably gives client-side rendering
            # time to finish before the DOM is annotated.
            time.sleep(5)
            wd.execute_script(_VISIBILITY_JS)
            return wd.page_source
        except TimeoutException:
            return ''
        finally:
            # Always shut the browser down, including when the script or any
            # other WebDriver call fails; otherwise every failed request
            # leaves a Chrome process behind. quit() also closes all windows,
            # so no separate close() is needed.
            wd.quit()
    except Exception as e:
        # Best-effort boundary: callers only distinguish '' from real HTML.
        print("异常:", e)
        return ''
def news_extraction(html):
    """Run ``GeneralNewsExtractor`` over *html* and return what it extracts.

    The nodes matched by the XPath expressions below are passed to the
    extractor as noise nodes, and ``use_visiable_info`` is enabled.
    """
    extractor = GeneralNewsExtractor()
    noise_xpaths = [
        '//div[@class="right-sidebar"]',
        '//div[@class="comment-list"]',
        '//*[@style="display:none"]',
        '//div[@class="statement"]',
    ]
    return extractor.extract(
        html,
        noise_node_list=noise_xpaths,
        use_visiable_info=True,
    )
if __name__ == "__main__":
    # Executed as a script: start the Flask server with its default settings.
    app.run()