Python crawler: using requests to capture Baidu Post Bar novels and Geek College courses

Time: 2021-1-16

1. The page-fetching function

def getHtml(url):
    #Construct the request headers
    headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
    #Fetch the page with requests
    gethtml=requests.get(url,headers=headers)
    return gethtml.text
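
A minimal usage sketch (the thread URL is the one used later in this article; it assumes requests is installed):

import requests

def getHtml(url):
    #Same function as above, repeated so the snippet runs on its own
    headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
    return requests.get(url,headers=headers).text

html=getHtml('http://tieba.baidu.com/p/3826846894?see_lz=1&pn=1')
print(html[:200])    #peek at the first 200 characters of the source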

2. Generating the list of page URLs

def changeurl(start_url,page):    #parameters: (starting URL, number of pages)
    urls=[]
    for i in range(1,page+1):
        url=start_url+str(i)
        urls.append(url)
    return urls
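
For example, with the thread URL from the next section (this sketch assumes the changeurl above is defined):

start_url="http://tieba.baidu.com/p/3826846894?see_lz=1&pn="
urls=changeurl(start_url,3)
#urls is now:
#['http://tieba.baidu.com/p/3826846894?see_lz=1&pn=1',
# 'http://tieba.baidu.com/p/3826846894?see_lz=1&pn=2',
# 'http://tieba.baidu.com/p/3826846894?see_lz=1&pn=3']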

3. Full code: capturing a novel from Baidu Post Bar

# -*- coding:utf-8 -*-
import requests
import re
#Make printed Chinese display correctly (Python 2 only; see the follow-up notes for Python 3)
import sys
reload(sys)
sys.setdefaultencoding('utf-8')


def changeurl(start_url,page):    #parameters: (starting URL, number of pages)
    urls=[]
    for i in range(1,page+1):
        url=start_url+str(i)
        urls.append(url)
    return urls
    
def getHtml(url):
    #Construct the request headers
    headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
    #Fetch the page with requests
    gethtml=requests.get(url,headers=headers)
    return gethtml.text
#Open wangyuan.txt in 'w+' mode
f=open('wangyuan.txt','w+')
#Start URL
start_url="http://tieba.baidu.com/p/3826846894?see_lz=1&pn="
#Generate all page links by calling changeurl()
all_link=changeurl(start_url,3)
#Traverse each link
for link in all_link:
    #Fetch the page source with getHtml(link)
    Yuanma=getHtml(link)
    #Use a regex to grab the content of every floor (post) on the page
    neirongs=re.findall('<div id="post_content_.*?" class="d_post_content .*?">            (.*?)</div>',Yuanma,re.S)
    #Traverse the contents of each floor
    for neirong in neirongs:
        #Clean up the captured content
        neirong=neirong.replace('<br>','')
        neirong=neirong.replace('<img class="BDE_Image" pic_type="0" width="560" height="395" pic_ext="jpeg"  ><img class="BDE_Image" pic_type="0" width="560" height="150" pic_ext="jpeg"  >','')
        neirong=re.sub('<a href=.*?</a>',"",neirong,flags=re.S)
        f.write(neirong)
#Close the file
f.close()
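
One subtlety in the code above: the fourth positional parameter of re.sub is count, not flags, so passing re.S positionally would silently cap the number of replacements at 16 (the integer value of re.S) instead of enabling dot-all matching. That is why the cleanup line passes flags=re.S by keyword. A small sketch of the difference:

import re

text='<a href="#">line one\nline two</a> kept text'
#re.S is the integer 16, so here it is treated as count=16, not as a flag;
#without DOTALL the pattern cannot match across the newline, so nothing is removed:
print(re.sub('<a href=.*?</a>','',text,re.S))
#passed by keyword, the flag works and the multi-line anchor is removed:
print(re.sub('<a href=.*?</a>','',text,flags=re.S))    #prints ' kept text'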

Result: the captured novel text is written to wangyuan.txt (output screenshot omitted).

4. Capturing course information from Geek College (jikexueyuan.com)

4.1 Splitting the course page into per-course blocks with a regex

def geteveryclass(html):
    everyclass = re.findall('(<li id=.*?</li>)',html,re.S)
    return everyclass
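
On a toy input the function behaves like this (the markup below is simplified for illustration, not the site's real HTML):

import re

def geteveryclass(html):
    everyclass = re.findall('(<li id=.*?</li>)',html,re.S)
    return everyclass

sample='<ul><li id="c1">course one</li><li id="c2">course two</li></ul>'
print(geteveryclass(sample))
#['<li id="c1">course one</li>', '<li id="c2">course two</li>']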

4.2 Extracting the information from each course block

def getinfo(eachclass):
    info = {}    #dictionary to hold one course's fields
    info['title']= re.search('<h2 class="lesson-info-h2"><a href=".*?" target="_blank" jktag=".*?">(.*?)</a>',eachclass,re.S).group(1)
    info['content']=re.search('''<p style=.*?>(.*?)</p>''',eachclass,re.S).group(1)
    timeandlevel = re.findall('<em>(.*?)</em>',eachclass,re.S)
    info['classtime']=timeandlevel[0]
    info['classlevel'] = timeandlevel[1]
    info['learnnum'] = re.search('<em class="learn-number".*?>(.*?)</em>',eachclass,re.S).group(1)
    return info
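
The returned dictionary has five string fields; its shape looks like this (the values are placeholders, not real data from the site):

#illustrative shape of the dict returned by getinfo
info={
    'title':'...course title...',
    'content':'...course description...',
    'classtime':'...duration...',
    'classlevel':'...difficulty level...',
    'learnnum':'...number of learners...',
}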

4.3 Saving the information to a file

def saveinfo(classinfo):
    f = open('info.txt','w+')
    for each in classinfo:
        f.writelines('title:' + each['title'] + '\n')
        f.writelines('content:' + each['content'] + '\n')
        f.writelines('classtime:' + each['classtime'] + '\n')
        f.writelines('classlevel:' + each['classlevel'] + '\n')
        f.writelines('learnnum:' + each['learnnum'] +'\n\n\n')
        f.writelines("======================")

4.4 Main program code

classinfo=[]
#Starting URL
start_url='http://www.jikexueyuan.com/course/?pageNum='
#Generate all page links with changeurl
all_links=changeurl(start_url,2)
for link in all_links:    #traverse all links
    print u'Processing ' + link
    #Get the source code of the page
    html=getsource(link)
    #Split the source into a list of course blocks
    everyclass=geteveryclass(html)
    #Traverse the list of course blocks
    for eachclass in everyclass:
        #Get the info (title, content, etc.) for this course
        neirong=getinfo(eachclass)
        classinfo.append(neirong)
        
saveinfo(classinfo)

4.5 Complete code

# -*- coding:utf-8 -*-
import requests
import re
import sys
reload(sys)
sys.setdefaultencoding('utf-8')    #Python 2 only; see the follow-up notes for Python 3
#Get the page source code
def getsource(url):
    html = requests.get(url)
    return html.text
#Generate the list of page URLs
def changeurl(start_url,page):    
    urls=[]
    for i in range(1,page+1):
        url=start_url+str(i)
        urls.append(url)
    return urls
#Get every course block
def geteveryclass(html):
    everyclass = re.findall('(<li id=.*?</li>)',html,re.S)
    return everyclass
#Extract the fields of one course block (an earlier attempt is kept commented out below)
def getinfo(eachclass):
    info = {}    #dictionary to hold one course's fields
    info['title']= re.search('<h2 class="lesson-info-h2"><a href=".*?" target="_blank" jktag=".*?">(.*?)</a>',eachclass,re.S).group(1)
    info['content']=re.search('''<p style=.*?>(.*?)</p>''',eachclass,re.S).group(1)
    timeandlevel = re.findall('<em>(.*?)</em>',eachclass,re.S)
    info['classtime']=timeandlevel[0]
    info['classlevel'] = timeandlevel[1]
    info['learnnum'] = re.search('<em class="learn-number".*?>(.*?)</em>',eachclass,re.S).group(1)
    '''
    content = re.search(<p style=.*?>(.*?)</p>,eachclass,re.S)
        
  
    timeandlevel = re.findall('<em>(.*?)</em>',eachclass,re.S)
    info['classtime'] = timeandlevel[0]
    info['classlevel'] = timeandlevel[1]
    info['learnnum'] = re.search('"learn-number">(.*?)</em>',eachclass,re.S).group(1)
    '''
    return info
def saveinfo(classinfo):
    f = open('info.txt','w+')
    for each in classinfo:
        f.writelines('title:' + each['title'] + '\n')
        f.writelines('content:' + each['content'] + '\n')
        f.writelines('classtime:' + each['classtime'] + '\n')
        f.writelines('classlevel:' + each['classlevel'] + '\n')
        f.writelines('learnnum:' + each['learnnum'] +'\n\n\n')
        f.writelines("======================")
classinfo=[]
#Starting URL
start_url='http://www.jikexueyuan.com/course/?pageNum='
#Generate all page links with changeurl
all_links=changeurl(start_url,2)
for link in all_links:    #traverse all links
    print u'Processing ' + link
    #Get the source code of the page
    html=getsource(link)
    #Split the source into a list of course blocks
    everyclass=geteveryclass(html)
    #Traverse the list of course blocks
    for eachclass in everyclass:
        #Get the info (title, content, etc.) for this course
        neirong=getinfo(eachclass)
        classinfo.append(neirong)
        
saveinfo(classinfo)

Result: the course information for both pages is written to info.txt (output screenshot omitted).

Follow-up notes: Python 3.4+

# -*- coding:utf-8 -*-
import requests
import re
import importlib
import sys
importlib.reload(sys)

Python 3 strings are Unicode by default, so Chinese text can now be handled directly.
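
A minimal Python 3 sketch: Chinese text can be printed and written without any setdefaultencoding hack, as long as the file is opened with an explicit encoding:

#Python 3: str is Unicode, no setdefaultencoding needed
text='百度贴吧'    #'Baidu Post Bar'
print(text)
with open('demo.txt','w',encoding='utf-8') as f:
    f.write(text)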
