豆果网数据爬取

常规网页的静态爬虫。

利用 csv 模块的 csv.DictWriter,以字典形式逐行写入 CSV 文件来存储数据。

# -*- coding: utf-8 -*-

import csv
import time
import random
import requests
import traceback
from lxml import etree


# 获取首页源码
def get_page(url, retries=3, timeout=10):
    """Download ``url`` and return the response body as text.

    The request carries a desktop-browser ``User-Agent`` header. A failed
    request is retried immediately, up to ``retries`` attempts in total.

    Args:
        url: Address of the page to download.
        retries: Total number of attempts before giving up (default 3).
        timeout: Timeout in seconds handed to ``requests.get`` (default 10).

    Returns:
        The response text, or ``None`` when every attempt failed.
    """
    headers = {
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
    }
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            # Only network/HTTP-layer failures are retried; any other
            # exception is a programming error and should surface.
            if attempt == retries:
                print('请求{}次均失败,放弃此url请求,检查请求条件'.format(retries))
                return None
            print('请求失败,重新请求')
        else:
            return response.text
    # Reached only when retries < 1, i.e. no attempt was made.
    return None


# 爬取一页数据
def _joined_text(node, path):
    """Concatenate every string that ``path`` selects under ``node``, stripped."""
    return ''.join(node.xpath(path)).strip()


def parse_page(html, caipu):
    """Extract the recipe entries from one listing page and append them to CSV.

    Every ``<li class="clearfix">`` element becomes one row in
    ``./caipu.csv`` (append mode, ``utf_8_sig`` encoding, no header row)
    with the columns title, href, peiliao, rate, id, img and caipu.

    Args:
        html: Page source as text. A falsy value (``None`` or ``''``) is
            ignored, because ``get_page`` returns ``None`` when a download
            fails.
        caipu: Cuisine name stored in the ``caipu`` column of every row.

    Returns:
        None. Parsing or writing errors are reported with a traceback on
        stderr and are not propagated.
    """
    if not html:
        # Nothing was downloaded, so there is nothing to parse.
        return

    fieldnames = ['title', 'href', 'peiliao', 'rate', 'id', 'img', 'caipu']
    try:
        nodes = etree.HTML(html).xpath('//li[@class="clearfix"]')
        if not nodes:
            # Avoid creating an empty CSV file when the page has no entries.
            return
        # Open the file once per page instead of once per row. In append
        # mode the utf_8_sig BOM is only emitted when the file is empty,
        # so the bytes written are the same as with per-row opens.
        with open('./caipu.csv', 'a', encoding='utf_8_sig', newline='') as fp:
            writer = csv.DictWriter(fp, fieldnames)
            for node in nodes:
                writer.writerow({
                    'title': _joined_text(node, './a/@title'),
                    'href': 'https://www.douguo.com' + _joined_text(node, './div/a/@href'),
                    'peiliao': _joined_text(node, './div/p/text()'),
                    'rate': _joined_text(node, './div/div[1]/span[2]/text()'),
                    'id': _joined_text(node, './div/div[2]/a[1]/text()'),
                    'img': _joined_text(node, './a/img/@src'),
                    'caipu': caipu,
                })
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print().
        traceback.print_exc()


# 主函数
def main(x, caipu=None):
    """Scrape one listing page of a cuisine and append its rows to the CSV.

    Args:
        x: Zero-based page index. The last URL path segment is ``x * 20``
            (presumably 20 recipes per listing page; confirm against the
            site).
        caipu: Cuisine name inserted into the listing URL and stored with
            every row. When omitted, the module-level ``caipu`` variable is
            used, which is how this function originally obtained the value.

    Raises:
        NameError: If ``caipu`` is not passed and no module-level ``caipu``
            exists.
    """
    if caipu is None:
        # Backward compatibility for callers that set the global instead.
        try:
            caipu = globals()['caipu']
        except KeyError:
            raise NameError("name 'caipu' is not defined") from None
    url = 'https://www.douguo.com/caipu/{}/0/{}'.format(caipu, x * 20)
    print(url)
    html = get_page(url)
    if html is None:
        # The download failed after all retries; nothing to parse.
        return
    parse_page(html, caipu)


if __name__ == '__main__':
    caipu_list = ['川菜', '湘菜', '粤菜', '东北菜', '鲁菜', '浙菜', '湖北菜', '清真菜']  # Chinese regional cuisines
    start = time.time()  # wall-clock start, for the summary below
    for caipu in caipu_list:
        for i in range(22):
            # Crawl the listing pages of this cuisine one by one.
            main(x=i, caipu=caipu)
            # Random 1-2 s pause between pages.
            time.sleep(random.uniform(1, 2))
            print(caipu, "第" + str(i + 1) + "页提取完成")
    end = time.time()
    print('共用时', round((end - start) / 60, 2), '分钟')
Update time: 2020-08-12

results matching ""

    No results matching ""