python爬虫

ninehua 2024-12-06 18:07 33 浏览

# -*- coding: utf-8 -*-
"""
Created on Tue Oct 22 10:41:23 2024

@author: 1
"""

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium_stealth import stealth
import time
import random
import requests
from selenium.webdriver.chrome.options import Options
import re
import json

# Filesystem location of the ChromeDriver executable.
chrome_driver_path = "D:\\chromedriver\\chromedriver.exe"

# Options object that carries every browser setting below.
chrome_options = Options()

# Filesystem location of the Chrome browser binary itself.
chrome_options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"

# Command-line switches handed to Chrome, applied in the listed order.
# An optional proxy could be added the same way, e.g.
# "--proxy-server=your_proxy_server:port".
_chrome_switches = (
    # Override the User-Agent string reported by the browser.
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
    # Do not load extensions.
    "--disable-extensions",
    # Turn off GPU acceleration.
    "--disable-gpu",
    # Reduce Chrome's log output.
    "--log-level=3",
    # Run without the sandbox.
    "--no-sandbox",
    # Do not load plugins.
    "--disable-plugins",
    # The original note says this suppresses the save-password prompt —
    # NOTE(review): confirm against the Chromium switch documentation.
    "--password-store=basic",
    # Popup-related switch; the original note describes it as disabling popups.
    "--disable-popup-blocking",
    # Turn off web notifications.
    "--disable-notifications",
    # Silence audio output.
    "--mute-audio",
)
for _switch in _chrome_switches:
    chrome_options.add_argument(_switch)

# Service wrapper around the ChromeDriver executable.
service = Service(chrome_driver_path)

# Start the browser session with the service and options prepared above.
driver = webdriver.Chrome(service=service, options=chrome_options)

# Apply selenium-stealth to the session to reduce automation detection.
_stealth_settings = {
    "languages": ["en-US", "en"],
    "vendor": "Google Inc.",
    "platform": "Win32",
    "webgl_vendor": "Intel Inc.",
    "renderer": "Intel Iris OpenGL Engine",
    "fix_hairline": True,
}
stealth(driver, **_stealth_settings)


# Upper bound (seconds) for each detail request, so that a stalled
# connection cannot block the run indefinitely.
REQUEST_TIMEOUT = 30

# Headers sent with every detail request; they do not change per item, so
# they are built once.
# NOTE(review): this User-Agent differs from the one configured for the
# browser session — confirm whether the two should match.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Accept-Language': 'en-US,en;q=0.9',
    'Content-Type': 'application/x-www-form-urlencoded',  # adjust as needed
    'Referer': 'https://www.qianlima.com/'  # adjust as needed
}

# Regular expression that captures the numeric value of each data-cid
# attribute in the page HTML.
pattern = r'data-cid="(\d+)"'


def print_json_values(data, prefix=""):
    """Print every leaf value of a decoded JSON structure with its path.

    Dict keys are joined with "." and list positions are rendered as
    "[index]", so each non-container value is printed as "<path>: <value>".

    Args:
        data: A decoded JSON value (dict, list, or scalar).
        prefix: Path accumulated so far; empty for the root value.
    """
    if isinstance(data, dict):
        for key, value in data.items():
            new_prefix = f"{prefix}.{key}" if prefix else key
            print_json_values(value, new_prefix)
    elif isinstance(data, list):
        for index, value in enumerate(data):
            new_prefix = f"{prefix}[{index}]"
            print_json_values(value, new_prefix)
    else:
        print(f"{prefix}: {data}")


try:
    # Open the target page.
    driver.get("https:///")

    # Check whether the browser ended up on a different URL.
    final_url = driver.current_url
    print(final_url)
    if final_url != "https://":
        print(f"发生了重定向，最终 URL: {final_url}")
        # Load the redirected URL explicitly.
        driver.get(final_url)

    # Make navigator.webdriver report undefined to page scripts.
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

    # Wait a random amount of time before interacting with the page.
    random_time = random.uniform(1, 4)
    time.sleep(random_time)

    # Queue simulated mouse/keyboard actions; nothing runs until perform().
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.NAME, "username"))
    )
    actions = webdriver.ActionChains(driver)
    actions.move_to_element(element)
    actions.pause(random.uniform(0.5, 1.5))
    actions.click()
    actions.pause(random.uniform(0.5, 1.5))
    actions.send_keys("username")

    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.NAME, "password"))
    )

    actions.move_to_element(element)
    actions.pause(random.uniform(0.5, 1.5))
    actions.click()
    actions.pause(random.uniform(0.5, 1.5))
    actions.send_keys("password")

    # Locate the login button.
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//button[@class="el-button handle-btn el-button--default"]/span[text()="登录"]'))
    )

    actions.move_to_element(element)
    actions.pause(random.uniform(0.5, 1.5))
    actions.click()

    # Execute the whole queued sequence (username, password, login click).
    actions.perform()
    time.sleep(2)

    # Report the page title after the login attempt.
    page_title = driver.title
    print(f"页面标题: {page_title}")

    # Navigate to the listing page.
    driver.get("https://")

    # Give the page a few seconds to load.
    time.sleep(5)

    # Number of pages to walk. Currently 1; the original note mentioned 5.
    for _ in range(1):
        # NOTE(review): the "next page" click happens before extraction, so
        # the page loaded above is not scraped — confirm this is intended.
        button = driver.find_element(By.XPATH, "//button[@class='btn-next' and span[text()='下一页']]")
        button.click()

        # Wait for the next page to render.
        time.sleep(3)

        print("aaaaaaaaaaaaaaaaaaaaaaa")

        # Copy the browser session cookies into the name -> value mapping
        # that requests accepts.
        cookies = driver.get_cookies()
        cookie_dict = {cookie['name']: cookie['value'] for cookie in cookies}

        # Collect every data-cid value present in the current page HTML.
        ahtml = driver.page_source
        matches = re.findall(pattern, ahtml)

        for cid in matches:
            print(cid)

            aurl = 'https://www.qianlima.com/bid-' + cid + '.html'
            print(aurl)
            ajsonurl = 'https://detail.vip.qianlima.com/rest/detail/alltypesdetail/detail/' + cid
            print(ajsonurl)

            # POST to the detail endpoint with the browser's cookies. No
            # request body is sent. The timeout keeps a stalled connection
            # from hanging the run.
            response = requests.post(
                ajsonurl,
                cookies=cookie_dict,
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )

            if response.status_code == 200:
                print('POST 请求成功')
                print('响应内容:', response.text)
                # Decode the JSON body and dump every leaf value.
                data = json.loads(response.text)
                print('开始解析结果')
                print_json_values(data)

                # TODO: store the result in a database.

                print('结束解析结果')
            else:
                print('POST 请求失败，状态码:', response.status_code)
                print('响应内容:', response.text)

except Exception as e:
    print(f"发生错误: {e}")
# NOTE(review): the browser is never closed here; the original cleanup was
# commented out. Call driver.quit() once the session is no longer needed.

chromedriver.exe

上一篇：5个APP自动化测试辅助定位工具，你用过几个?
下一篇：更简单地自动化操作浏览器

python爬虫

相关推荐

超强!批量修改文件名工具，一次处理上万个，支持各种网盘!

详解什么是BT种子、迅雷下载链接、磁力链接

potplayer在线字幕翻译插件——potplayer播放器功能插件推荐

P106/104系列显卡安装魔改驱动步骤

佳能R62、R7、R10解决ERR70问题的新固件来了，附详细升级方法!

麒麟桌面操作系统如何安装佳能打印机驱动程序(图文)

Gopeed，全平台多线程高速下载器，支持磁力BT下载，跑满宽带

【Steam防坑指南】:教你识破“假入库”骗局!保住钱包和账号!

磁力链接bt链接转直链IDM下载

方正兰亭黑系列/汉仪旗黑家族/思源黑体介绍与区别