How Python implements IP proxy pool based on redis

Time:2020-9-15

This article explains how to implement an IP proxy pool in Python backed by redis. The example code is explained in detail and should be a useful reference for study or work; readers who need it can follow along.

The apscheduler library is used to crawl proxy IPs on a schedule, to periodically re-test them, and to delete dead ones. Detection is two-layered: freshly crawled IPs go into redis db0 for a first check; IPs that pass are promoted into redis db1, where they are checked again periodically, ensuring that the proxies handed out are actually usable.

import requests, redis
import pandas
import random

from apscheduler.schedulers.blocking import BlockingScheduler
import datetime
import logging

# One connection pool per logical database. NOTE: the `db` (and
# `max_connections`) arguments passed to redis.Redis() are ignored when an
# explicit connection_pool is supplied, so the original code pointed BOTH
# connections at db 0 — the db must be configured on the pool itself.
db_conn = redis.ConnectionPool(host="*.*.*.*", port=6379, password="123456",
                               db=0, max_connections=10)
_db_conn_1 = redis.ConnectionPool(host="*.*.*.*", port=6379, password="123456",
                                  db=1, max_connections=10)
redis_conn_0 = redis.Redis(connection_pool=db_conn)    # first-pass pool (db0)
redis_conn_1 = redis.Redis(connection_pool=_db_conn_1) # verified pool (db1)


def remove_ip(ip, redis_conn):
  """Delete *ip* from the "IP" sorted set in the given redis database.

  The original body used ``Print``/``IP`` (garbled capitalisation), which
  raised NameError on every call.
  """
  redis_conn.zrem("IP", ip)
  print("deleted %s..." % ip)


def get_ip_num(redis_conn):
  """Return the number of proxy IPs stored in the "IP" sorted set."""
  return redis_conn.zcard("IP")


def get_port(ip, redis_conn):
  """Return the port for *ip* (stored as its sorted-set score) as a string.

  Returns "" when the ip is not in the set.  The original code returned the
  literal string "None" in that case, and stripped the float's ".0" suffix
  with str.replace, which is fragile; scores are integral ports, so an int
  conversion is exact.
  """
  score = redis_conn.zscore("IP", ip)
  if score is None:
    return ""
  return str(int(score))


def add_ip(ip, port, redis_conn):
  """Add *ip* with *port* as its score to the "IP" sorted set.

  nx=True: never update the score of an existing member, only add new ones
  (the original passed ``nx=55``, which redis-py merely treats as truthy).
  The original also used ``Print``/``IP`` (garbled), raising NameError.
  """
  redis_conn.zadd("IP", {ip: port}, nx=True)
  print("added %s %s ... OK" % (ip, port))


def get_all_ip(redis_conn):
  """Return every IP in the "IP" sorted set (raw redis members)."""
  return redis_conn.zrange("IP", 0, -1)


def get_random_ip(redis_conn):
  """Pick one IP uniformly at random from the "IP" sorted set.

  Returns ``(ip, port)`` as strings, or ``("", "")`` when the set is empty.

  Fixes over the original:
  - ``random.randint`` is inclusive on both ends, so the valid last index is
    ``total - 1`` (the old ``randint(0, total)`` could select one past the
    end and spuriously return an empty result).
  - redis members come back as bytes; decode them instead of mangling with
    ``str(...).replace("b", "")``, which would strip every "b" character.
  """
  total = get_ip_num(redis_conn)
  if total <= 0:
    return "", ""
  idx = random.randint(0, total - 1)
  members = redis_conn.zrange("IP", idx, idx)
  if not members:
    return "", ""
  raw = members[0]
  ip = raw.decode() if isinstance(raw, bytes) else str(raw)
  port = get_port(ip, redis_conn)
  return ip, port


def spider_ip(x, redis_conn):
  """Crawl www.89ip.cn (pages 1-19) and store each ip/port via add_ip.

  *x* is just a label printed alongside the timestamp so scheduler runs can
  be told apart in the output.
  """
  print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), x)
  for page in range(1, 20):
    tables = pandas.read_html("http://www.89ip.cn/index_{}.html".format(page))
    table = tables[0]
    # Column 0 holds the ip, column 1 the port.
    for row in range(len(table)):
      ip = table.iloc[row, 0]
      port = table.iloc[row, 1]
      print("ip", ip)
      print("port", port)
      add_ip(str(ip), str(port), redis_conn)


# File logging for the scheduler: INFO and above is appended to log1.txt
# with timestamp, source file, line number and level on every record.
logging.basicConfig(level=logging.INFO,
          format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
          datefmt='%Y-%m-%d %H:%M:%S',
          filename='log1.txt',
          filemode='a')


def aps_detection_ip(x, redis_conn):
  """Probe one random proxy from *redis_conn*; promote or evict it.

  A proxy that answers is copied into the verified pool (``redis_conn_1``)
  unless it already came from there; a proxy that fails is removed from the
  database it was drawn from.

  Fixes over the original:
  - ``Print``/``IP``/``RES`` (garbled casing) raised NameError on success.
  - The probe URL is plain http, so the proxy must be registered under the
    ``'http'`` key — the original used ``'https'``, meaning the proxy was
    never actually exercised and every entry looked "available".
  - A timeout keeps this 3-second-interval job from piling up on dead
    proxies; an empty pool is skipped instead of probing ``":"``.
  """
  print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), x)
  ip, port = get_random_ip(redis_conn)
  if not ip:
    return  # pool is empty, nothing to check yet
  try:
    requests.get("http://www.baidu.com",
           proxies={'http': 'http://{ip}:{port}'.format(ip=ip, port=port)},
           timeout=5)
    print("available", ip, port)
    if redis_conn != redis_conn_1:
      add_ip(str(ip), str(port), redis_conn_1)
  except Exception:
    # Any failure (connect error, timeout, bad proxy) evicts the entry.
    remove_ip(ip, redis_conn)


# Job schedule (the original lines were garbled with stray spaces inside
# identifiers — `add_ job`, `aps_ detection_ IP` — and could not run):
# every 3 s probe one random proxy in db0 and one in db1; every 2 h crawl
# fresh proxies into db0.
scheduler = BlockingScheduler()
scheduler.add_job(func=aps_detection_ip, args=('detect circular task 0', redis_conn_0),
                  trigger='interval', seconds=3, id='aps_detection_ip_task0', max_instances=10)
scheduler.add_job(func=spider_ip, args=('Get circular task 0', redis_conn_0),
                  trigger='interval', seconds=60 * 60 * 2, id='spider_ip_task0', max_instances=10)

scheduler.add_job(func=aps_detection_ip, args=('detect circular task 1', redis_conn_1),
                  trigger='interval', seconds=3, id='aps_detection_ip_task1', max_instances=10)

# Route apscheduler's internal logging through the logging config above.
scheduler._logger = logging

# scheduler.start()
if __name__ == '__main__':
  # print(get_ip_num(redis_conn_0))
  # spider_ip("get loop task", redis_conn_0)
  # BlockingScheduler.start() blocks forever, running the jobs above.
  scheduler.start()
  # aps_detection_ip("detect loop task", redis_conn_0)

That is the whole content of this article. I hope it helps you in your study, and I hope you will continue to support us.