Scraping Tmall Products with Python Selenium (Part 1)

First, break down what will be scraped into categories, following the annotations in the figure below.

The main workflow is as follows: open the Tmall homepage with Selenium and hover over each first-level category so the sub-menus render, save the page as index.html, parse the three-level category tree out of it with lxml XPath, and write the tree into MySQL.

The core code is as follows:

import json

import pymysql
from lxml import etree

# Local MySQL connection (password redacted)
connect = pymysql.Connect(
    host='127.0.0.1',
    port=3306,
    user='root',
    passwd='XXXXXXXXXX',
    db='test',
    charset='utf8'
)
cursor = connect.cursor()


def insert(name, level, parentId, url):
    # Parameterized queries let the driver handle quoting, so category
    # names containing quotes cannot break (or inject into) the SQL
    if parentId is None:
        sql = "insert into category(name, level) values(%s, %s);"
        args = (name, level)
    elif url is None:
        sql = "insert into category(name, level, parent_id) values(%s, %s, %s);"
        args = (name, level, parentId)
    else:
        sql = "insert into category(name, level, parent_id, url) values(%s, %s, %s, %s);"
        args = (name, level, parentId, url)
    print(cursor.mogrify(sql, args))
    cursor.execute(sql, args)
    current_id = connect.insert_id()  # auto-increment id of the new row
    connect.commit()
    return current_id
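
# The inserts above assume a `category` table roughly like the following.
# The schema is inferred from the column names used here; the original
# post does not show the actual DDL:
#
#   create table category (
#       id        int primary key auto_increment,
#       name      varchar(255) not null,
#       level     int not null,
#       parent_id int,
#       url       varchar(1024)
#   ) default charset=utf8;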


def getContent(ele):
    # lxml hands the text back decoded with the wrong charset; the
    # ISO-8859-1 round-trip recovers the UTF-8 Chinese text (see note 2)
    if isinstance(ele, list):
        # Join multiple text nodes with "/"
        return "/".join(item.encode("ISO-8859-1").decode("utf-8") for item in ele)
    return ele.encode("ISO-8859-1").decode("utf-8")


# Parse the saved Tmall homepage offline instead of fetching it live
html = etree.parse("index.html", etree.HTMLParser())

# First-level categories: the <li> items of the left-hand navigation
first = html.xpath("//*[@id='content']/div[2]/div[1]/div[2]/div[1]/ul/li")

categoryList = []
for temp in first:
    ele = temp.xpath("./a/text()")
    item = {}
    item['name'] = getContent(ele)
    categoryList.append(item)

# Each panel under div[3] holds the second/third-level categories for the
# first-level category at the same position, so walk the two in lockstep
second = html.xpath("//*[@id='content']/div[2]/div[1]/div[3]/div")

for index, item_second in enumerate(second):
    third = item_second.xpath("./div/div[1]/div")
    secondList = []
    for item_third in third:
        item = {}
        # Second-level category name
        p = item_third.xpath("./div[1]/div/text()")
        item['name'] = getContent(p)
        thirdList = []
        # Third-level categories are the links inside this block
        for item_detail in item_third.xpath("./div[2]/a"):
            temp_detail = {}
            temp_detail["url"] = str(item_detail.xpath("./@href")[0]).replace("'", "")
            temp_detail['name'] = getContent(item_detail.xpath("./text()"))
            thirdList.append(temp_detail)
        item['third'] = thirdList
        secondList.append(item)
    categoryList[index]['second'] = secondList

print(json.dumps(categoryList, ensure_ascii=False))
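
# The dumped list is nested three levels deep, roughly:
#   [{'name': <level-1 name>,
#     'second': [{'name': <level-2 name>,
#                 'third': [{'name': <level-3 name>, 'url': <href>}]}]}]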

# Persist the tree: level-1 rows first, then their children, linking each
# child to its parent through the returned auto-increment id
for cate in categoryList:
    firstId = insert(cate['name'], 1, None, None)
    for temp in cate['second']:
        secondId = insert(temp['name'], 2, firstId, None)
        for temp2 in temp['third']:
            insert(temp2['name'], 3, secondId, temp2['url'])
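
To spot-check what landed in MySQL, here is a minimal query sketch; it assumes the schema sketched above and is not part of the original script:

cursor.execute(
    "select c1.name, c2.name, c3.name, c3.url "
    "from category c1 "
    "join category c2 on c2.parent_id = c1.id "
    "join category c3 on c3.parent_id = c2.id "
    "limit 5;"
)
for row in cursor.fetchall():
    print(row)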

The database results are as follows:

Notes:

1. When grabbing elements from the Tmall homepage, hover over each first-level category first; otherwise the second-level category elements come back empty, since they are only rendered on hover (see the Selenium sketch below).

2. Watch the text encoding, or the XPath text extraction will fail or return garbled results.
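
For reference, here is a minimal Selenium sketch of the hover-and-save step from note 1. The waits and output path are assumptions for illustration; this part of the script is not shown in the original post:

import time

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://www.tmall.com/")
time.sleep(3)  # give the homepage time to render

# Hover over every first-level category so the lazily rendered
# second/third-level panels get inserted into the DOM (note 1)
items = driver.find_elements(
    By.XPATH, "//*[@id='content']/div[2]/div[1]/div[2]/div[1]/ul/li")
for li in items:
    ActionChains(driver).move_to_element(li).perform()
    time.sleep(0.5)

# Save the fully rendered page for the offline lxml step above
with open("index.html", "w", encoding="utf-8") as f:
    f.write(driver.page_source)
driver.quit()

Depending on how index.html is written out, the ISO-8859-1 round-trip in getContent may or may not be needed.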
