Multithreaded crawler for Alibaba complaint information on the Black Cat (Heimao) complaint platform

Time:2020-10-23

For learning purposes only; please keep the number of threads moderate

1. Code

import requests
from requests_html import HTMLSession
import time
from concurrent.futures import ThreadPoolExecutor
import json

# Thread pool shared by all page-crawling tasks: 30 concurrent workers.
pool = ThreadPoolExecutor(30)
# Accumulates the enriched complaint records produced by worker threads.
big_list = []
# Futures of submitted crawl tasks; awaited in run() before saving.
pool_name_list =[]
# requests_html session used to fetch and parse complaint detail pages.
session = HTMLSession()

def dewu_company(x):
    """Crawl one page of complaints from the Sina Black Cat API and enrich
    each complaint with data scraped from its detail page.

    Appends every enriched complaint record to the module-level ``big_list``.
    Intended to run inside the shared thread pool; any failure makes the whole
    page be skipped with a message rather than killing the crawl.

    :param x: zero-based page index; the API is asked for page ``x + 1``.
    """
    try:
        print(f'page {x + 1}')

        params = {
            'couid': '1878960481',           # company id on tousu.sina.com.cn
            'type': '1',
            'page_size': f'{(x + 1) * 10}',
            'page': f'{x + 1}',
        }
        url = 'https://tousu.sina.com.cn/api/company/received_complaints'
        res = requests.get(url, params=params, verify=False)
        info_list = res.json()['result']['data']['complaints']
        for dict_info in info_list:
            # The API returns protocol-relative URLs ("//..."); add the scheme.
            dict_info['main']['url'] = 'https:' + dict_info['main']['url']
            dict_info['author']['avatar'] = 'https:' + dict_info['author']['avatar']
            info_url = dict_info['main']['url']
            print(info_url)
            res = session.get(info_url, verify=False)
            new_dict = dict()
            # Labelled fields of the complaint detail page, located by
            # absolute XPath (the page layout is fixed).
            new_dict['complaint_number'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[1]/text()')[0]
            new_dict['complaint_target'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[2]/a/text()')[0]
            new_dict['complaint_problem'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[3]/text()')[0]
            new_dict['complaint_request'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[4]/text()')[0]
            new_dict['amount_involved'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[5]/text()')[0]
            new_dict['complaint_progress'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[6]/b/text()')[0]
            new_dict['complaint_process_details'] = res.html.xpath('//*[@class="ts-d-steplist"]')[0].text
            # Image links are protocol-relative as well.
            not_have_http_img_list = res.html.xpath('//*[@class="example-image-link"]/@href')
            new_dict['complaint_pictures'] = ['https:' + a for a in not_have_http_img_list]

            vide_id_list = res.html.xpath('//*[@class="video-ico"]/@data-id')
            print(vide_id_list)
            new_vide_list = []
            for vide_id in vide_id_list:
                t = int(time.time())
                vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isAuto=1'
                res = session.get(vide_info_url, verify=False)
                try:
                    new_vide_list.append(res.json())
                except ValueError:
                    # Video metadata endpoint returned non-JSON; skip this video.
                    pass
            new_dict['complaint_video_details'] = new_vide_list
            dict_info['complaint_details'] = new_dict
            big_list.append(dict_info)
    except Exception as e:
        # Best effort: one failed page must not abort the whole crawl.
        print(f'error, skipping page {x + 1}: {e}')

def run(page):
    """Crawl ``page`` pages concurrently, wait for all of them to finish, and
    dump the collected complaint records to a local JSON file.

    :param page: number of pages to crawl (pages 1 .. page).
    """
    # Submit one crawl task per page to the shared thread pool.
    for x in range(page):
        pool_name_list.append(pool.submit(dewu_company, x))
    # Block until every submitted page has completed (or raised).
    for future in pool_name_list:
        future.result()
    print('all pages done, saving locally')
    # ensure_ascii=False keeps non-ASCII complaint text human-readable on disk.
    with open('alibaba complaint information.json', 'w', encoding='utf8') as fw:
        json.dump(big_list, fw, ensure_ascii=False)
    print('save finished')

if __name__ == '__main__':
    # Crawl a single page when executed as a script.
    run(1)