Crawler training project: get the movie with the highest score of Douban and download it

Time:2020-6-25

Pre review

In the last blog, we learned four Python crawler libraries: urllib, requests, and BeautifulSoup, as well as selenium
Introduction to common crawler libraries

  • Learned the common usage of urllib and requests
  • Learned to use BeautifulSoup to parse web pages and selenium to drive the browser
# Import the web driver module from selenium
from selenium import webdriver

# Create a Chrome driver instance (requires chromedriver on PATH)
driver = webdriver.Chrome()

# Use the get method to open Baidu
driver.get("https://www.baidu.com")

# Locate the search input box and type in the query.
# Fixed: `input.send_ Keys (...)` was a mangled identifier (syntax error);
# the selenium API method is `send_keys`. Also renamed the variable so it
# no longer shadows the `input` builtin.
search_box = driver.find_element_by_css_selector('#kw')
search_box.send_keys("photo of potono's undressing")

# Locate the search button and click it
button = driver.find_element_by_css_selector('#su')
button.click()

This is the image-search code from the last post; the effect is as follows
Crawler training project: get the movie with the highest score of Douban and download it

Crawler training project: get the movie with the highest score of Douban and download it

Capture Douban film and save local

Let’s grab the top 250 movies on Douban

import requests
from bs4 import BeautifulSoup
import xlwt
# Group: 45692667 — more learning materials, hands-on projects, and a study community

def request_douban(url):
    """Fetch *url* and return the page HTML, or None on any failure.

    Douban rejects requests that lack a browser-like User-Agent, so one
    is supplied explicitly; a timeout prevents the crawl from hanging.
    """
    headers = {
        # Douban blocks the default python-requests User-Agent
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/80.0.3987.122 Safari/537.36'),
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None  # explicit: non-200 responses yield None
    except requests.RequestException:
        return None


# Workbook that will hold the scraped data
book = xlwt.Workbook(encoding='utf-8', style_compression=0)

# Fixed: `book.add_ Sheet (...)` and `cell_ overwrite_ ok` were mangled
# identifiers (syntax errors); the xlwt API is add_sheet / cell_overwrite_ok.
sheet = book.add_sheet('douban movie top250', cell_overwrite_ok=True)

# Write the header row once, column by column
for _col, _title in enumerate(
        ('name', 'picture', 'ranking', 'rating', 'author', 'Introduction')):
    sheet.write(0, _col, _title)

# Next worksheet row to write to (row 0 is the header)
n = 1


def save_to_excel(soup):
    """Extract every movie from one parsed top-250 page and append the
    rows to the module-level worksheet ``sheet``.

    Advances the module-level row counter ``n`` by one per movie.
    """
    global n

    # One <li> per movie inside the grid_view list
    movie_items = soup.find(class_='grid_view').find_all('li')

    for item in movie_items:
        item_name = item.find(class_='title').string
        item_img = item.find('a').find('img').get('src')
        # The rank number sits in an element whose class attribute is empty
        item_index = item.find(class_='').string
        item_score = item.find(class_='rating_num').string
        item_author = item.find('p').text
        # Not every movie has a one-line quote (class 'inq'); default to ''
        # so item_intr is always defined — the original could raise
        # NameError on the first movie or silently reuse the previous
        # movie's quote.
        item_intr = ''
        if item.find(class_='inq') is not None:
            item_intr = item.find(class_='inq').string

        # Fixed: `Print` and `item_ index` were mangled identifiers
        # (syntax errors); the builtin is `print`.
        print('crawl movie: ' + item_index + ' | ' + item_name + ' | '
              + item_score + ' | ' + item_intr)

        sheet.write(n, 0, item_name)
        sheet.write(n, 1, item_img)
        sheet.write(n, 2, item_index)
        sheet.write(n, 3, item_score)
        sheet.write(n, 4, item_author)
        sheet.write(n, 5, item_intr)

        n = n + 1


def main(page):
    """Fetch one page of the top-250 list (25 movies) and store it."""
    start = page * 25
    url = f'https://movie.douban.com/top250?start={start}&filter='
    page_html = request_douban(url)
    save_to_excel(BeautifulSoup(page_html, 'lxml'))


if __name__ == '__main__':
    # Crawl all ten pages: 10 pages x 25 movies = top 250
    for page_no in range(10):
        main(page_no)

book.save (u'douban's most popular 250 films. CSV ')

code analysis

Import related libraries first

import requests
#Request Web Library
from bs4 import BeautifulSoup
#Parse web page Library
import xlwt
#Interact with Excel file

Define a function to request a web page

def request_douban(url):
    """Fetch *url* and return the page HTML, or None on any failure.

    Douban rejects requests that lack a browser-like User-Agent, so one
    is supplied explicitly; a timeout prevents the crawl from hanging.
    """
    headers = {
        # Douban blocks the default python-requests User-Agent
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/80.0.3987.122 Safari/537.36'),
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None  # explicit: non-200 responses yield None
    except requests.RequestException:
        return None

Create an Excel to store data

# Workbook that will hold the scraped data
book = xlwt.Workbook(encoding='utf-8', style_compression=0)

# Fixed: `book.add_ Sheet (...)` and `cell_ overwrite_ ok` were mangled
# identifiers (syntax errors); the xlwt API is add_sheet / cell_overwrite_ok.
sheet = book.add_sheet('douban movie top250', cell_overwrite_ok=True)

# Write the header row once, column by column
for _col, _title in enumerate(
        ('name', 'picture', 'ranking', 'rating', 'author', 'Introduction')):
    sheet.write(0, _col, _title)

# Next worksheet row to write to (row 0 is the header)
n = 1

Define a function to save the data extracted by BeautifulSoup into the Excel sheet

def save_to_excel(soup):
    """Extract every movie from one parsed top-250 page and append the
    rows to the module-level worksheet ``sheet``.

    Advances the module-level row counter ``n`` by one per movie.
    """
    global n

    # One <li> per movie inside the grid_view list
    movie_items = soup.find(class_='grid_view').find_all('li')

    for item in movie_items:
        item_name = item.find(class_='title').string
        item_img = item.find('a').find('img').get('src')
        # The rank number sits in an element whose class attribute is empty
        item_index = item.find(class_='').string
        item_score = item.find(class_='rating_num').string
        item_author = item.find('p').text
        # Not every movie has a one-line quote (class 'inq'); default to ''
        # so item_intr is always defined — the original could raise
        # NameError on the first movie or silently reuse the previous
        # movie's quote.
        item_intr = ''
        if item.find(class_='inq') is not None:
            item_intr = item.find(class_='inq').string

        # Fixed: `Print` and `item_ index` were mangled identifiers
        # (syntax errors); the builtin is `print`.
        print('crawl movie: ' + item_index + ' | ' + item_name + ' | '
              + item_score + ' | ' + item_intr)

        sheet.write(n, 0, item_name)
        sheet.write(n, 1, item_img)
        sheet.write(n, 2, item_index)
        sheet.write(n, 3, item_score)
        sheet.write(n, 4, item_author)
        sheet.write(n, 5, item_intr)

        n = n + 1

Define the main function, which builds the URL for a given page and stores its data; then call it for each page

def main(page):
    """Fetch one page of the top-250 list (25 movies) and store it."""
    start = page * 25
    url = f'https://movie.douban.com/top250?start={start}&filter='
    page_html = request_douban(url)
    save_to_excel(BeautifulSoup(page_html, 'lxml'))


if __name__ == '__main__':
    # Crawl all ten pages: 10 pages x 25 movies = top 250
    for page_no in range(10):
        main(page_no)

After running, the file "douban's most popular 250 films.csv" appears in the folder. Open it and have a look

Crawler training project: get the movie with the highest score of Douban and download it