League of Legends Python crawler
Hero main page (QQ): https://lol.qq.com/data/info-heros.shtml
1. Hero crawling
https://lol.qq.com/data/info-heros.shtml
A GET request with an id parameter returns the page for a single hero:
https://lol.qq.com/data/info-heros.shtml?id=xxx
where id=xxx selects the hero.
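For example (a minimal sketch; id=1 is just a sample value):
import requests

# Fetch one hero's detail page; the id value here is only an example.
r = requests.get('https://lol.qq.com/data/info-heros.shtml', params={'id': 1})
print(r.status_code)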
2. Getting all hero information from the hero-list JS file
import json
import requests
from faker import Factory
from bs4 import BeautifulSoup
f = Factory.create()
def get_all_heros():
    url = 'https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js'
    headers = {
        'user-agent': f.user_agent()
    }
    r = requests.get(url, headers=headers)
    r.encoding = r.apparent_encoding
    c = r.text
    l = json.loads(c)['hero']
    for i in l[:50]:
        print("ID: {0}  Name: {1}  alias: {2}".format(i['heroId'], i['name'], i['alias']))

if __name__ == '__main__':
    get_all_heros()
Result: (screenshot omitted)
3. Crawling game data
The first LoL page to crawl:
http://www.wanplus.com/lol/playerstats
This site uses a CSRF token: the POST request must carry a token derived from the cookies set by an initial GET.
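Distilled, the handshake looks like this (a sketch of the pattern the full script below implements; the cookie name and token rule are taken from its get_token()):
import requests

# Sketch: GET once so the server sets the wanplus_csrf cookie, derive the token, POST with both.
s = requests.Session()
s.get('http://www.wanplus.com/lol/playerstats')
raw = s.cookies.get('wanplus_csrf')
token = str(int(raw[9:]) + 16777216)  # drop the 9-char prefix, add 0x1000000, as get_token() below does
# Then POST to /ajax/stats/list with 'X-CSRF-Token': token in the headers and '_gtk': token in the body.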
import json
import time
import requests
from faker import Factory
from urllib import parse
f = Factory.create()
def get_token():
    url = 'http://www.wanplus.com/lol/playerstats'
    headers = {
        'user-agent': f.user_agent(),
        'Referer': 'http://www.wanplus.com/lol/teamstats',
        'Host': 'www.wanplus.com',
    }
    r = requests.get(url, headers=headers, allow_redirects=False)
    r.encoding = r.apparent_encoding
    c = r.cookies
    r.close()
    myCookies = c.get_dict()
    # print(myCookies)
    # The X-CSRF-Token is derived from the wanplus_csrf cookie:
    # drop the first 9 characters and add 16777216 (0x1000000).
    return str(int(c.get('wanplus_csrf')[9:]) + 16777216), myCookies
def get_competition():
    url = 'http://www.wanplus.com/ajax/stats/list'
    token, myCookies = get_token()
    headers = {
        'user-agent': f.user_agent(),
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.wanplus.com',
        'Origin': 'http://www.wanplus.com',
        'Referer': 'http://www.wanplus.com/lol/playerstats',
        'X-CSRF-Token': token,
        'X-Requested-With': 'XMLHttpRequest',
    }
    formdata = {
        '_gtk': token,
        'draw': '1',
    }
    # The grid is a DataTables table: every column submits the same six fields.
    # Columns 0-3 (order, playername, teamname, meta) are not sortable; the rest are.
    columns = ['order', 'playername', 'teamname', 'meta', 'appearedTimes', 'kda',
               'attendrate', 'killsPergame', 'mostkills', 'deathsPergame',
               'mostdeaths', 'assistsPergame', 'mostassists', 'goldsPermin',
               'lasthitPermin', 'damagetoheroPermin', 'damagetoheroPercent',
               'damagetakenPermin', 'damagetakenPercent', 'wardsplacedPermin',
               'wardskilledPermin']
    for idx, col in enumerate(columns):
        formdata['columns[{0}][data]'.format(idx)] = col
        formdata['columns[{0}][name]'.format(idx)] = ''
        formdata['columns[{0}][searchable]'.format(idx)] = 'true'
        formdata['columns[{0}][orderable]'.format(idx)] = 'false' if idx < 4 else 'true'
        formdata['columns[{0}][search][value]'.format(idx)] = ''
        formdata['columns[{0}][search][regex]'.format(idx)] = 'false'
    formdata.update({
        'order[0][column]': '4',
        'order[0][dir]': 'desc',
        'start': '0',
        'length': '20',
        'search[value]': '',
        'search[regex]': 'false',
        'area': '',
        'eid': '1065',
        'type': 'player',
        'gametype': '2',
        'filter': '{"team":{},"player":{},"meta":{}}',
    })
    # Convert the dict to k1=v1&k2=v2 form
    data = parse.urlencode(formdata)
    # print(data)
    r = requests.post(url, cookies=myCookies, data=data, headers=headers, allow_redirects=False)
    r.encoding = r.apparent_encoding
    c = r.text
    if len(c) < 100:
        print('Fetch failed, trying again!')
        return False
    print('Fetch succeeded!')
    l = json.loads(c)['data']
    for i in l[:20]:
        print('team id: {0}  team name: {1}  player name: {2}'.format(i['teamid'], i['teamname'], i['playername']))
    return True
def cookie_to_dic(mycookie):
    # Turn a raw 'k1=v1; k2=v2' cookie string into a dict.
    dic = {}
    for i in mycookie.split('; '):
        dic[i.split('=')[0]] = i.split('=')[1]
    return dic
if __name__ == '__main__':
    while 1:
        ok = get_competition()
        if ok is True:
            break
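Since the body follows the DataTables convention, later pages are presumably reachable by advancing start in steps of length (an assumption from the parameter names, not verified against the site):
# Hypothetical paging helper: 'start'/'length'/'draw' are DataTables paging fields,
# so page n should start at n * page_size.
def page_params(page, page_size=20):
    return {'start': str(page * page_size),
            'length': str(page_size),
            'draw': str(page + 1)}

# Usage sketch: formdata.update(page_params(2)) inside get_competition(), then urlencode and POST as above.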
The second LoL page to crawl:
http://lol.admin.pentaq.com/
There is no anti-crawling or CSRF token check here:
from faker import Factory
import requests
import json
f = Factory.create()
def fun():
    url = 'http://lol.admin.pentaq.com/api/tournament_team_data?tour=29&patch='
    headers = {
        'user-agent': f.user_agent()
    }
    r = requests.get(url, headers=headers)
    r.encoding = r.apparent_encoding
    c = r.text
    r.close()
    l = json.loads(c)['data']['teams_data']
    for i in l[:20]:
        print("team name: {0}  team id: {1}  wins: {2}".format(i['team_full_name'], i['team_id'], i['win']))

if __name__ == '__main__':
    fun()
The third LoL page to crawl:
http://www.op.gg/champion/statistics
No token handling is needed; BeautifulSoup is enough.
from faker import Factory
import requests
from bs4 import BeautifulSoup
f = Factory.create()
def fun():
    url = 'http://www.op.gg/champion/statistics'
    headers = {
        'user-agent': f.user_agent(),
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
    }
    r = requests.get(url, headers=headers)
    r.encoding = r.apparent_encoding
    if r.status_code != 200:
        return False
    c = r.text
    r.close()
    # print(c)
    if len(c) < 10000:
        return False
    html = BeautifulSoup(c, 'html.parser')
    l = html.find('tbody', class_='tabItem champion-trend-tier-TOP').find_all('tr')
    for x in l[:5]:
        a = x.find_all('td')
        tmp = a[3]
        b = tmp.find_all('div')
        name = b[0].text
        pos = b[1].text.replace('\t', '').replace('\n', '')
        print('rank: {0}  name: {1}  pos: {2}  win rate: {3}  pick rate: {4}'.format(a[0].text, name, pos, a[4].text, a[5].text))
    return True
if __name__ == '__main__':
    while True:
        ok = fun()
        if ok:
            break
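The tbody class above ends in the lane tag TOP. Assuming the other tabs follow the same naming (JUNGLE, MID, ADC, SUPPORT are inferred from that class name, not verified), the same parsing can be repeated per position:
from bs4 import BeautifulSoup

def rows_per_lane(html):
    # Hypothetical generalization of the TOP-tab lookup above; the other
    # lane class names are guessed from 'champion-trend-tier-TOP'.
    for lane in ('TOP', 'JUNGLE', 'MID', 'ADC', 'SUPPORT'):
        tbody = html.find('tbody', class_='tabItem champion-trend-tier-' + lane)
        if tbody is None:
            continue  # tab absent or the class-name guess is wrong
        print(lane, len(tbody.find_all('tr')), 'rows')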
4. Multi-threaded crawling of LoL hero skin images
1. get_url_list() fetches the list of hero ids that the per-hero URLs are built from.
2. download() saves each skin image into a per-hero folder.
3. main() runs the crawl tasks on a thread pool; an equivalent concurrent.futures sketch follows the code.
import requests
import json
import os
from faker import Factory
from multiprocessing.dummy import Pool as ThreadPool
import time
f = Factory.create()
headers = {
    'user-agent': f.user_agent()
}
def get_url_list():
    url = 'https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js'
    r = requests.get(url, headers=headers)
    r.encoding = r.apparent_encoding
    c = r.text
    heros = json.loads(c)["hero"]  # one entry per hero (about 156 at the time of writing)
    idList = []
    for hero in heros:
        hero_id = hero["heroId"]
        idList.append(hero_id)
    # print(idList)
    return idList
def spider(url):
    r = requests.get(url, headers=headers)
    r.encoding = r.apparent_encoding
    c = r.text
    r.close()
    res_dict = json.loads(c)
    skins = res_dict["skins"]  # the skin entries for this hero
    for index, hero in enumerate(skins):  # enumerate gives an index for naming the image files
        item = {}  # dict holding one skin's fields
        item['name'] = hero["heroName"]
        item['skin_name'] = hero["name"]
        if hero["mainImg"] == '':
            continue
        item['imgLink'] = hero["mainImg"]
        # print(item)
        download(index + 1, item)
def download(index, contdict):
    name = contdict['name']
    path = "skin/" + name
    if not os.path.exists(path):
        os.makedirs(path)
    content = requests.get(contdict['imgLink'], headers=headers).content
    with open(path + '/' + contdict['skin_name'] + str(index) + '.jpg', 'wb') as fp:
        fp.write(content)
def main():
    start = time.time()
    pool = ThreadPool(6)
    page = []
    for i in range(1, 11):  # first 10 hero ids as a demo; get_url_list() returns them all
        newpage = 'https://game.gtimg.cn/images/lol/act/img/js/hero/{}.js'.format(i)
        print(newpage)
        page.append(newpage)
    result = pool.map(spider, page)
    pool.close()
    pool.join()
    end = time.time()
    print('time:', end - start)
if __name__ == '__main__':
    main()
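multiprocessing.dummy's Pool is just a thread pool, so the same fan-out can also be written with the standard-library concurrent.futures API (an equivalent sketch reusing spider() from above):
from concurrent.futures import ThreadPoolExecutor

def main_futures(pages):
    # Same behavior as the ThreadPool(6) version: map spider over the URLs on 6 threads.
    with ThreadPoolExecutor(max_workers=6) as pool:
        list(pool.map(spider, pages))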