抖音网页版高清视频抓取教程（基于 Selenium）
废话不多说，直接上代码：
from selenium import webdriver
from selenium.webdriver import ChromeOptions
import time
import re
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import uuid
import os
import requests
# Module-level Chrome session shared by the crawler functions in this file.
option = ChromeOptions()
# Override the default user agent. The value carries no inner quotes: the
# argument is handed to Chrome without going through a shell, so quotes would
# end up as part of the user-agent value itself.
option.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36"
)
# Drop the "enable-automation" switch and the automation extension so the
# site is less likely to detect an automated browser.
option.add_experimental_option("excludeSwitches", ["enable-automation"])
option.add_experimental_option("useAutomationExtension", False)
browser = webdriver.Chrome(options=option)
# Before any page script runs, make navigator.webdriver report undefined.
browser.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {
        "source": 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
    },
)
browser.maximize_window()  # maximize the browser window
def douyincrawler(keyword):
    """Search Douyin for *keyword* and download each video listed in the results.

    The function drives the module-level ``browser``: it opens the search
    page, clicks the login button and waits so the user can log in by
    scanning the QR code, scrolls the page several times to load more
    results, then opens every result in turn, extracts the video URL from
    the page source and passes it to ``savevideo``.

    Failures on an individual video are printed and skipped; the detail tab
    is always closed and focus returned to the results tab so one failure
    does not break the remaining iterations.

    Args:
        keyword: Search term; it is percent-encoded before being placed in
            the URL path.
    """
    from urllib.parse import quote  # local import keeps the block self-contained

    url = (
        "https://www.douyin.com/search/"
        + quote(keyword, safe="")
        + "?publish_time=0&sort_type=0&source=switch_tab&type=video"
    )
    browser.get(url)
    # Click the login button; the user then logs in by scanning with the Douyin app.
    browser.find_element(By.XPATH, '//*[@id="qdblhsHs"]/button').click()
    time.sleep(15)  # wait for the QR-code login to be completed

    for _ in range(5):  # scroll down repeatedly
        time.sleep(5)
        browser.execute_script("var q=document.documentElement.scrollTop=10000")
        if "服务出现异常" in browser.page_source:
            # The page reports a service error: reload it.
            browser.refresh()
        if "服务异常,重新" in browser.page_source:
            # Click the element that retries loading the results.
            browser.find_element(
                By.XPATH, '//*[@id="dark"]/div[2]/div/div[3]/div[2]/div/div/span'
            ).click()

    # Collect the detail-page links of all results currently on the page.
    detail_links = browser.find_elements(
        By.XPATH, '//*[@id="dark"]/div[2]/div/div[3]/div[2]/ul/li/div/div/a[1]'
    )
    print("共计侦查到{}个视频数据".format(len(detail_links)))

    main_handle = browser.current_window_handle
    video_xpath = (
        '//*[@id="root"]/div/div[2]/div/div/div[1]/div[1]/div[2]/div/div[1]'
        "/div/div[2]/div[2]/xg-video-container/video"
    )
    source_pattern = re.compile(r'<source class="" src="(.*?)"')

    for link in detail_links:
        try:
            # Click through JavaScript so that elements which are present but
            # not clickable via the normal API still work.
            browser.execute_script("arguments[0].click();", link)
            new_handles = [h for h in browser.window_handles if h != main_handle]
            if not new_handles:
                print("no detail window was opened for this result; skipping")
                continue
            browser.switch_to.window(new_handles[-1])
            try:
                # Explicitly wait for the <video> element to appear.
                WebDriverWait(browser, 10).until(
                    EC.presence_of_element_located((By.XPATH, video_xpath))
                )
                match = source_pattern.search(browser.page_source)
                if match is None:
                    print("no video source found on detail page; skipping")
                    continue
                savevideo("https:" + match.group(1))
            finally:
                # Always close the detail tab and return to the results tab,
                # otherwise every later iteration would act on the wrong window.
                browser.close()
                browser.switch_to.window(main_handle)
        except Exception as e:
            # Best-effort per video: report the problem and move on.
            print(e)
def savevideo(video_url, video_dir=r"C:\Users\lvye\Desktop\dou_yin\video"):
    """Download *video_url* into *video_dir* under a random ``<uuid4>.mp4`` name.

    Args:
        video_url: Absolute URL of the video file to fetch.
        video_dir: Destination directory; it is created if missing.
            NOTE(review): the default was reconstructed from a path whose
            separators had been lost ("C:UserslvyeDesktopdou_yinvideo") --
            confirm the intended location.

    Raises:
        requests.RequestException: On connection failure, timeout or an
            HTTP error status.
        OSError: If the directory or file cannot be created or written.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
    }
    os.makedirs(video_dir, exist_ok=True)
    video_full_path = os.path.join(video_dir, str(uuid.uuid4()) + ".mp4")
    # Stream the body so a large video is never held entirely in memory, and
    # set a timeout so a stalled connection cannot hang the crawler forever.
    with requests.get(
        url=video_url, headers=headers, stream=True, timeout=30
    ) as response:
        # Fail before creating the file so error pages are not saved as .mp4.
        response.raise_for_status()
        with open(video_full_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    print("已下载:{}".format(video_url))
# Script entry point: crawl one hard-coded keyword, then shut the browser down.
if __name__ == "__main__":
    try:
        douyincrawler("街拍美女")
    finally:
        # Always end the WebDriver session so Chrome and chromedriver
        # processes are not left running after the script exits.
        browser.quit()


