Thread pool
- Import the package:
from multiprocessing.dummy import Pool
- The callback function asynchronously performs some operation on each element of an iterable object
- Note: the callback must take exactly one parameter
- Asynchrony is mainly useful for time-consuming operations
from multiprocessing.dummy import Pool
pool = Pool(3)  # instantiate the thread pool object; 3 is the maximum number of threads in the pool
# Parameter 1: callback function (only the function name, no parentheses); parameter 2: a list
# The callback receives one element of the list at a time and can do some operation on it
pool.map(callback, list)
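For example, a minimal sketch (the download callback and the urls list are made up just to show the shape of the call):
from multiprocessing.dummy import Pool
import time

def download(url):   # the callback: exactly one parameter
    time.sleep(2)    # stands in for a time-consuming operation
    return url + ' done'

urls = ['url_1', 'url_2', 'url_3']  # hypothetical data
pool = Pool(3)                      # at most 3 threads
results = pool.map(download, urls)  # each element of urls is passed to download
print(results)                      # the three calls run concurrently, so roughly 2 s total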
Testing: synchronous & asynchronous efficiency
Build a Flask server, start the service yourself, and test the execution time
- Create a new file, server.py:
from flask import Flask, render_template
import time

app = Flask(__name__)

@app.route('/xx')
def index_1():
    time.sleep(2)
    return render_template('test.html')

@app.route('/yy')
def index_2():
    time.sleep(2)
    return render_template('test.html')

@app.route('/oo')
def index_3():
    time.sleep(2)
    return render_template('test.html')

if __name__ == '__main__':
    app.run(debug=True)
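A usage note (assuming the Flask development server's default host and port): start the server with
python server.py
and the three routes respond at http://127.0.0.1:5000/xx, /yy and /oo, each sleeping 2 seconds before returning test.html, which gives a predictably slow endpoint to test against.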
- Create a templates folder; under it create an HTML file (mine is test.html) and fill it with some arbitrary content
test.html holds arbitrary filler: a few poet names (Li Qingzhao, Wang Anshi, Su Shi, Liu Zongyuan, Du Fu, Du Mu), some lines of classical poetry, and a span. The only line that matters later is the Phoenix Terrace poem ("Phoenix stage phoenix tour, Phoenix to Taikong river flow, Wu palace flowers and plants buried path, Jin Dynasty clothing into ancient hills"), which sits in the element with id="feng" that the parser selects below.
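A minimal sketch of what test.html might contain; the only detail the parsing code relies on is an element whose id is "feng" (assumed here to wrap the Phoenix Terrace poem that appears in the results below), the rest is arbitrary filler:
<!DOCTYPE html>
<html>
<body>
    <p>Li Qingzhao</p>
    <p>Du Fu</p>
    <span>this is span</span>
    <div id="feng">Phoenix stage phoenix tour, Phoenix to Taikong river flow, Wu palace flowers and plants buried path, Jin Dynasty clothing into ancient hills</div>
</body>
</html>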
Synchronous & asynchronous execution time
import requests
from bs4 import BeautifulSoup
import time
#Thread pool module
from multiprocessing.dummy import Pool
urls = [
'http://127.0.0.1:5000/xx',
'http://127.0.0.1:5000/yy',
'http://127.0.0.1:5000/oo',
]
#Data crawling, return the crawled page source data
def get_request(url):
    page_text = requests.get(url=url).text
    return page_text

# Data parsing: return the target tag's text
def parse(page_text):
    soup = BeautifulSoup(page_text, 'lxml')
    return soup.select('#feng')[0].text
# Synchronous code
if __name__ == '__main__':
    start = time.time()
    for url in urls:
        page_text = get_request(url)
        text_data = parse(page_text)
        print(text_data)
    print(time.time() - start)
"""
Implementation results:
Phoenix stage phoenix tour, Phoenix to Taikong river flow, Wu palace flowers and plants buried path, Jin Dynasty clothing into ancient hills
Phoenix stage phoenix tour, Phoenix to Taikong river flow, Wu palace flowers and plants buried path, Jin Dynasty clothing into ancient hills
Phoenix stage phoenix tour, Phoenix to Taikong river flow, Wu palace flowers and plants buried path, Jin Dynasty clothing into ancient hills
6.056272029876709
"""
# Asynchronous code
if __name__ == '__main__':
    start = time.time()
    pool = Pool(3)  # instantiate the thread pool object
    # Parameter 1: callback function (only the function name, no parentheses); parameter 2: a list
    # The callback receives one element of the list at a time and can do some operation on it
    page_text_list = pool.map(get_request, urls)
    text_data = pool.map(parse, page_text_list)
    for i in text_data:
        print(i)
    print(time.time() - start)
"""
Implementation results:
Phoenix stage phoenix tour, Phoenix to Taikong river flow, Wu palace flowers and plants buried path, Jin Dynasty clothing into ancient hills
Phoenix stage phoenix tour, Phoenix to Taikong river flow, Wu palace flowers and plants buried path, Jin Dynasty clothing into ancient hills
Phoenix stage phoenix tour, Phoenix to Taikong river flow, Wu palace flowers and plants buried path, Jin Dynasty clothing into ancient hills
2.0537397861480713
Dropping the final print loop only changes the time by about 0.01 seconds
"""
To sum up: each request blocks for about 2 seconds, so the synchronous version needs roughly 6 s for three URLs, while the thread pool handles all three concurrently in about 2 s; asynchronous execution is significantly more efficient.
Case: crawling Pear Video with a thread pool
- Thinking analysis
- Crawl the URLs of the video detail pages and store them in an iterable object
- Send requests to those pages to get each video's real address
- Note: the video on the detail page is generated dynamically by JS code, so regular-expression parsing is required
- Write a callback that fetches the video's binary data and stores it persistently
import requests
from lxml import etree
from multiprocessing.dummy import Pool
import re
import os
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
}
# URL of the Pear Video 'wealth' (finance) section
main_url = 'https://www.pearvideo.com/category_3'
# Parse out the detail-page links under this section
main_page_text = requests.get(url=main_url, headers=headers).text
tree = etree.HTML(main_page_text)
li_list = tree.xpath('//*[@id="listvideoListUl"]/li')
# Build the list of video info dicts for the thread pool
video_urls = []
for li in li_list:
    # Detail-page URL and video title
    detail_url = "https://www.pearvideo.com/" + li.xpath('./div/a/@href')[0]
    name = li.xpath('./div/a/div[2]/text()')[0]
    # Request the detail page
    page_text = requests.get(url=detail_url, headers=headers).text
    # The video on the detail page is generated dynamically by JS code, so use a regular expression
    ex = 'srcUrl="(.*?)",vdoUrl='
    video_url = re.findall(ex, page_text, re.S)[0]  # re.findall returns a list
    dic = {
        'url': video_url,
        'name': name,
    }
    video_urls.append(dic)
# Callback function
def get_video(url):
    # Request the video address and persist the binary data
    video_data = requests.get(url=url['url'], headers=headers).content
    file_name = "./video/" + url['name'] + ".mp4"
    with open(file_name, 'wb') as f:
        f.write(video_data)
    print(url['name'], 'download finished!')
#Create a folder to store videos
dir_name = 'video'
if not os.path.exists(dir_name):
    os.mkdir(dir_name)
#Instantiating thread pool
pool = Pool(4)
pool.map(get_video, video_urls)
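Optionally (not in the original code), the pool can be closed and joined after map returns so the worker threads are released cleanly:
pool.close()  # no more tasks may be submitted to the pool
pool.join()   # wait for all worker threads to finish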
Single-thread, multi-task asynchronous coroutines
asyncio (key points)
Special function
- If a function's definition is decorated with the async keyword, it is a special function.
- Special features:
- When the function is called, the statements inside its body are not executed immediately.
- The call returns a coroutine object.
Coroutine
- A coroutine is an object: calling a special function returns a coroutine object.
- coroutine object == special function call
import asyncio
from time import sleep

async def get_request(url):
    print('requesting:', url)
    sleep(2)
    print('request succeeded:', url)
    return '666'

# Calling the special function returns a coroutine object
g = get_request("https://www.qq.com")
Task object
- A task object is a further encapsulation of the coroutine object (a higher-level coroutine object).
- task object == coroutine object == special function call (representing a fixed-form task)
# asyncio.ensure_future(coroutine object)
task = asyncio.ensure_future(g)  # g: the coroutine object
- Binding a callback:
# Define a callback function for the task
def callback(task):
    task.result()  # the return value of the special function wrapped by the current task object
    print("I'm callback:", task)

task.add_done_callback(funcName)  # task: the task object; funcName: the name of the callback function
- funcName: the callback must take exactly one parameter, which represents the current task object
- parameter.result(): the return value of the special function wrapped by the current task object
Event loop object
- Create the event loop object
- Register the task object with the event loop object
# Create the event loop object
loop = asyncio.get_event_loop()
# Register / load the task object into the event loop object, then start the loop
loop.run_until_complete(task)  # loads and starts the event loop; task: the task object
await
- await: when the blocking operation finishes, let the loop come back and execute the code after the blocking point.
Suspend
- asyncio.wait(): suspends the task objects, making each task give up control of the CPU so the event loop can switch to another task.
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
# asyncio.wait: the suspend operation
# tasks: a list of task objects
Key points for attention
- Code from modules that do not support asynchrony must not appear inside a special function, otherwise the asynchronous effect is broken.
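As a quick illustration of that point, here is a minimal sketch (with made-up URLs, using sleep calls as stand-ins for requests) contrasting a blocking call with an awaitable one inside a special function:
import asyncio
import time

async def bad_request(url):
    time.sleep(2)           # blocking call from a non-async module: breaks the async effect, tasks run one after another
    return url

async def good_request(url):
    await asyncio.sleep(2)  # async-aware sleep: the loop can switch to other tasks while this one waits
    return url

urls = ['url_1', 'url_2', 'url_3']  # hypothetical data
start = time.time()
tasks = [asyncio.ensure_future(good_request(u)) for u in urls]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
print(time.time() - start)  # about 2 s; swapping in bad_request would take about 6 s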
aiohttp (key points)
- requests: does not support asynchrony and must not appear inside special functions.
- aiohttp: a network request module that does support asynchrony; it is used together with asyncio. Install it with pip install aiohttp.
- Writing the code
- Write the basic structure first:
import asyncio
import aiohttp

# Asynchronous network request based on aiohttp
async def get_requests(url):
    # Instantiate a request object (session)
    with aiohttp.ClientSession() as aio:
        # aio.get/post(url=url, headers=headers, data/params=..., proxy='http://ip:port')
        with aio.get(url=url) as response:
            # text() gets the response data as a string
            # read() gets the response data as bytes
            page_text = await response.text()
            return page_text
- Details to add (see the complete code below):
- Add the async keyword before each with statement.
- Add the await keyword before each blocking operation.
- Complete code
import asyncio
import aiohttp

# Asynchronous network request based on aiohttp
async def get_requests(url):
    # Instantiate a request object (session)
    async with aiohttp.ClientSession() as aio:
        # aio.get/post(url=url, headers=headers, data/params=..., proxy='http://ip:port')
        async with await aio.get(url=url) as response:
            # text() gets the response data as a string
            # read() gets the response data as bytes
            page_text = await response.text()
            return page_text
Single task scheduling operation
import asyncio
from time import sleep
async def get_request(url):
    print('requesting:', url)
    sleep(2)
    print('request succeeded:', url)
    return '666'
# Define a callback function for the task
def callback(task):
    print("I'm callback:", task)
#Returns a coroutine object
g = get_request("https://www.qq.com")
#Create a task object
task = asyncio.ensure_future(g)
"""
#Bind callback function to task object
task.add_done_callback(callback)
#Create event loop object
loop = asyncio.get_event_loop()
#Register / load the task object into the event loop object, and then start the loop object
loop.run_ until_ Complete (task) ා used to load and start an event loop
"""
Implementation results:
Requesting: WWW, qq.com
Requesting: WWW, qq.com
"""
Multitask scheduling operation
import asyncio
import time
start = time.time()
async def get_request(url):
    print('requesting:', url)
    # await: when the blocking operation ends, let the loop come back and run the code after the blocking point
    await asyncio.sleep(2)
    print('request succeeded:', url)
    return '666'
urls = [
'http://127.0.0.1:5000/xx',
'http://127.0.0.1:5000/yy',
'http://127.0.0.1:5000/oo',
]
tasks = []
for url in urls:
    c = get_request(url)
    task = asyncio.ensure_future(c)
    tasks.append(task)

loop = asyncio.get_event_loop()
# When registering a task list with the event loop, the task list must be suspended
# asyncio.wait(): the suspend operation; each task gives up control of the CPU so the loop can switch between them
loop.run_until_complete(asyncio.wait(tasks))
print('total time consumption:', time.time() - start)
Single thread & multitask asynchronous crawler
Self test based on Flask
- The test server is the one shown above in "Testing: synchronous & asynchronous efficiency"
- Start that Flask project following the steps above, then run the code below.
import asyncio
import time
import aiohttp
from lxml import etree
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
}
urls = [
'http://127.0.0.1:5000/xx',
'http://127.0.0.1:5000/yy',
'http://127.0.0.1:5000/oo',
]
start = time.time()
"""
#Initiate a request to get response data (asynchronism is not possible)
async def get_requests(url):
#Requests is a module that does not support asynchrony
page_text = requests.get(url).text
return page_text
"""
async def get_requests(url):
    """
    Asynchronous network request based on aiohttp
    :param url:
    :return:
    """
    # Instantiate a request object (session)
    async with aiohttp.ClientSession() as aio:
        # aio.get/post(url=url, headers=headers, data/params=..., proxy='http://ip:port')
        async with await aio.get(url=url) as response:
            # text() gets the response data as a string
            # read() gets the response data as bytes
            page_text = await response.text()
            return page_text
def parse(task):
    """
    The callback function
    :param task:
    :return:
    """
    page_text = task.result()  # get the return value of the special function (the requested page source data)
    tree = etree.HTML(page_text)
    content = tree.xpath('//*[@id="feng"]/text()')[0]
    print(content)
tasks = []
for url in urls:
    c = get_requests(url)
    task = asyncio.ensure_future(c)
    task.add_done_callback(parse)
    tasks.append(task)
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
print('total time consumption:', time.time() - start)
Case: crawling Pear Video asynchronously with single-thread multitasking
- The analysis is the same as in "Case: crawling Pear Video with a thread pool" above
import asyncio
import time
import aiohttp
from lxml import etree
import re
import os
import requests
# The time module is used to measure how long the crawl takes
start = time.time()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
}
# URL of the Pear Video 'wealth' (finance) section
main_url = 'https://www.pearvideo.com/category_3'
main_page_text = requests.get(url=main_url, headers=headers).text
tree = etree.HTML(main_page_text)
li_list = tree.xpath('//*[@id="listvideoListUl"]/li')
urls = []  # [{'url': video_url, 'name': name}, {...}, ...]
for li in li_list:
    detail_url = "https://www.pearvideo.com/" + li.xpath('./div/a/@href')[0]
    name = li.xpath('./div/a/div[2]/text()')[0]
    page_text = requests.get(url=detail_url, headers=headers).text
    # The video on the detail page is generated dynamically by JS code
    ex = 'srcUrl="(.*?)",vdoUrl='
    video_url = re.findall(ex, page_text, re.S)[0]  # re.findall returns a list
    dic = {
        'url': video_url,
        'name': name,
    }
    urls.append(dic)
# Asynchronous network request based on aiohttp
async def get_requests(url):
    # Instantiate a request object (session)
    async with aiohttp.ClientSession() as aio:
        # aio.get/post(url=url, headers=headers, data/params=..., proxy='http://ip:port')
        async with await aio.get(url=url['url'], headers=headers) as response:
            # text() gets the response data as a string
            # read() gets the response data as bytes
            page_read = await response.read()
            dic = {
                "page_read": page_read,
                "name": url['name']
            }
            return dic
def parse(task):
    """
    The callback function
    :param task:
    :return:
    """
    dic_info = task.result()  # get the return value of the special function (the video's binary data and name)
    file_name = "./video/" + dic_info["name"] + ".mp4"
    with open(file_name, 'wb') as f:
        f.write(dic_info['page_read'])
    print(dic_info['name'], 'download finished!')
tasks = []
for url in urls:
    c = get_requests(url)
    task = asyncio.ensure_future(c)
    task.add_done_callback(parse)
    tasks.append(task)
dir_name = 'video'
if not os.path.exists(dir_name):
    os.mkdir(dir_name)
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
print('total time consumption:', time.time() - start)