動態爬蟲：MoneyDJ 基金基本資料 - 程式碼 - 【教材專區】Python網路爬蟲工作坊｜金融應用篇 - Cupoy

Workshop #3 MoneyDJ 基金基本資料爬蟲 目標：將 MoneyDJ 基金頁面上的每支基金基本資料及持股明細爬取下來 # 套件安裝 !pip install fake-useragent...

# Workshop #3: MoneyDJ fund basic-information crawler.
#
# Stated goal: scrape each fund's basic information and holdings detail from
# the MoneyDJ fund pages. (The code below collects the basic-information
# table and the performance table of every fund.)
#
# Package installation (run once, e.g. in a notebook cell):
#   pip install fake-useragent selenium webdriver-manager

# Load the required packages.
import json
from time import sleep

import numpy as np
import pandas as pd
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

# Seconds to wait for an element to appear before the wait raises.
WAIT_SECONDS = 3

# Index (into the collected fund list) of the fund used for the walkthrough.
SAMPLE_FUND_INDEX = 888

# XPath expressions used on the MoneyDJ pages.
FUND_TYPE_LINK_XPATH = '//div[@class="InternalSearch"]//a'
FUND_LINK_XPATH = '//table[@id="oMainTable"]//td/a'
INFO_READY_XPATH = '//table[@class="t04"]//td[@class="t2c1"]'
INFO_COL_XPATH = '//table[@class="t04"]//td[@class="t2" or @class="t2c1"]'
INFO_VAL_XPATH = '//table[@class="t04"]//td[@class="t3t2"]'
PERF_TABLE_XPATH = '//table[@class="t01"]'
PERF_COL_XPATH = './/td[contains(@class, "t2")]'
PERF_VAL_XPATH = './/td[contains(@class, "t3")]'

# Running the chromedriver download prints a message like the one below; it
# tells us where the latest version is stored, which is the launch location
# of the webdriver:
#
#   ====== WebDriver manager ======
#   Current google-chrome version is 102.0.5005
#   Get LATEST chromedriver version for 102.0.5005 google-chrome
#   Trying to download new driver from
#   https://chromedriver.storage.googleapis.com/102.0.5005.61/chromedriver_mac64.zip
#   Driver has been saved in cache
#   [/Users/jiunyiyang/.wdm/drivers/chromedriver/mac64/102.0.5005.61]
#
# Set the webdriver launch location. The path returned by install() is used
# directly instead of hard-coding one machine's cache directory, and no
# browser is started here (the original started one and never closed it).
chrome_path = ChromeDriverManager().install()

# Configure the User-Agent.
ua = UserAgent()
opt = webdriver.ChromeOptions()
opt.add_argument("--user-agent=%s" % ua.random)


def start_driver():
    """Launch a Chrome session using ``chrome_path`` and ``opt``.

    Returns:
        The new webdriver instance, with its window sized to 1024x850.
    """
    browser = webdriver.Chrome(service=Service(chrome_path), options=opt)
    browser.set_window_size(1024, 850)
    return browser


def get_links(xpath, skip_blank=False):
    """Collect ``(text, href)`` for elements matching *xpath* on the page.

    Uses the module-level ``driver``.

    Args:
        xpath: XPath expression selecting the anchor elements.
        skip_blank: When true, drop anchors whose text is empty/whitespace.

    Returns:
        A list of ``(text, href)`` tuples in document order.
    """
    links = []
    for element in driver.find_elements(By.XPATH, xpath):
        text = element.text
        if skip_blank and not text.strip():
            continue
        links.append((text, element.get_attribute("href")))
    return links


def pair_funds_with_companies(links):
    """Turn the alternating fund/company link list into fund records.

    The category page lists links in the order fund, company, fund,
    company, ...; even positions are funds and odd positions are companies.

    Args:
        links: ``(text, href)`` tuples as returned by ``get_links``.

    Returns:
        A list of ``(company, fund_name, fund_url)`` tuples. A trailing fund
        without a matching company link is dropped rather than raising.
    """
    funds = links[0::2]
    companies = links[1::2]
    return [
        (company_name, fund_name, fund_url)
        for (fund_name, fund_url), (company_name, _) in zip(funds, companies)
    ]


def get_info_table(info_url):
    """Load a fund's basic-information page and read its table.

    Uses the module-level ``driver`` and ``WAIT_SECONDS``.

    Args:
        info_url: URL of the fund's basic-information page.

    Returns:
        ``(info_col, info_val)``: the field-name cell texts and the value
        cell texts of the information table.

    Raises:
        selenium.common.exceptions.TimeoutException: if the table has not
            appeared within ``WAIT_SECONDS`` seconds.
    """
    driver.get(info_url)
    # Wait for the table before reading, so a slow page load does not
    # produce an empty result.
    WebDriverWait(driver, WAIT_SECONDS).until(
        EC.presence_of_element_located((By.XPATH, INFO_READY_XPATH))
    )
    info_col = [
        item.text for item in driver.find_elements(By.XPATH, INFO_COL_XPATH)
    ]
    info_val = [
        item.text for item in driver.find_elements(By.XPATH, INFO_VAL_XPATH)
    ]
    return info_col, info_val


def get_perf_table(perf_url):
    """Load a fund's performance page and read its first performance table.

    Uses the module-level ``driver`` and ``WAIT_SECONDS``.

    Args:
        perf_url: URL of the fund's performance page.

    Returns:
        ``(perf_col, perf_val)``: the header cell texts (newlines removed)
        and the value cell texts of the first ``table.t01`` on the page.

    Raises:
        selenium.common.exceptions.TimeoutException: if the table has not
            appeared within ``WAIT_SECONDS`` seconds.
    """
    driver.get(perf_url)
    # The wait returns the first matching table, so it is reused directly
    # instead of locating the table a second time.
    perf_table = WebDriverWait(driver, WAIT_SECONDS).until(
        EC.presence_of_element_located((By.XPATH, PERF_TABLE_XPATH))
    )
    perf_col = [
        item.text.replace("\n", "")
        for item in perf_table.find_elements(By.XPATH, PERF_COL_XPATH)
    ]
    perf_val = [
        item.text
        for item in perf_table.find_elements(By.XPATH, PERF_VAL_XPATH)
    ]
    return perf_col, perf_val


def scrape_or_empty(scraper, page_url):
    """Run *scraper* on *page_url*, falling back to empty lists on failure.

    The crawl is best-effort: one broken page must not stop the whole run,
    but the failure is reported instead of being silently discarded.

    Args:
        scraper: ``get_info_table`` or ``get_perf_table``.
        page_url: URL passed to *scraper*.

    Returns:
        The scraper's ``(columns, values)`` result, or ``([], [])``.
    """
    try:
        return scraper(page_url)
    except Exception as err:  # best-effort boundary of the crawl loop
        tqdm.write("Failed to scrape %s: %r" % (page_url, err))
        return [], []


def save_json(path, payload):
    """Write *payload* to *path* as indented UTF-8 JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)


# --- Get the fund list ------------------------------------------------------
# After loading the page, collect the name and URL of every fund from each
# fund-category listing.
url = "https://www.moneydj.com/funddj/yb/YP301000.djhtm"

# Initialise the webdriver.
driver = start_driver()
try:
    driver.get(url)

    # Collect all fund categories.
    fund_types = get_links(FUND_TYPE_LINK_XPATH)
    print(fund_types[:5])

    # Enter the first fund-category page (index funds).
    driver.get(fund_types[0][1])

    # Get the fund list.
    fund_company = get_links(FUND_LINK_XPATH, skip_blank=True)
    print(fund_company[0::2][:5])

    # Following the steps above, walk through every fund category.
    fund_company_list = []
    for _type_name, type_url in tqdm(fund_types):
        driver.get(type_url)
        fund_company = get_links(FUND_LINK_XPATH, skip_blank=True)
        fund_company_list += pair_funds_with_companies(fund_company)
        # Random pause between category pages.
        sleep(np.random.randint(3, 7))
finally:
    # Always close the browser, even if a page failed to load.
    driver.quit()

print(
    pd.DataFrame(
        fund_company_list, columns=["company", "fund_name", "fund_url"]
    ).head()
)

# Store to a JSON file.
save_json("fund_company_dict.json", {"list": fund_company_list})

# --- Get fund information ---------------------------------------------------
# Using the URL collected for each fund, gather its basic information.
# The sample index is clamped so a shorter fund list still yields a sample.
sample_fund = fund_company_list[
    min(SAMPLE_FUND_INDEX, len(fund_company_list) - 1)
]
fund_id = sample_fund[2].split("a=")[-1]
print(fund_id)

info_url = "https://www.moneydj.com/funddj/yp/yp011000.djhtm?a=%s" % (fund_id)
perf_url = "https://www.moneydj.com/funddj/yp/yp012000.djhtm?a=%s" % (fund_id)

# Use "wait for the element, then scrape" logic to avoid crawler errors
# caused by data that has not finished loading.
driver = start_driver()
try:
    # Basic-information table of the sample fund: field names and values.
    info_col, info_val = get_info_table(info_url)
    print(info_col[:5])
    print(info_val[:5])

    # --- Get the fund performance table -------------------------------------
    # Using the URL collected for each fund, gather its performance.
    perf_col, perf_val = get_perf_table(perf_url)
    print(pd.DataFrame([perf_val], columns=perf_col))

    # --- Walk through every fund --------------------------------------------
    # Load the previously stored fund list (id and url).
    with open("fund_company_dict.json", "r", encoding="utf-8") as f:
        fund_company_dict = json.load(f)
    fund_company_list = fund_company_dict["list"]
    print(len(fund_company_list))

    fund_info_dict = {}
    for fund in tqdm(fund_company_list):
        fund_name = fund[1]
        fund_id = fund[2].split("a=")[-1]
        info_url = (
            "https://www.moneydj.com/funddj/yp/yp011000.djhtm?a=%s" % (fund_id)
        )
        perf_url = (
            "https://www.moneydj.com/funddj/yp/yp012000.djhtm?a=%s" % (fund_id)
        )

        # The two pages are scraped independently so that a failure on the
        # performance page does not throw away the basic information.
        info_col, info_val = scrape_or_empty(get_info_table, info_url)
        perf_col, perf_val = scrape_or_empty(get_perf_table, perf_url)

        fund_info_dict[fund_id] = {
            "基金名稱": fund_name,
            "基本資料": dict(zip(info_col, info_val)),
            "績效": dict(zip(perf_col, perf_val)),
        }

        # Temporary checkpoint, so partial results survive an interruption.
        save_json("fund_info_dict.json", fund_info_dict)
        sleep(np.random.randint(3, 7))
finally:
    # Always close the browser, even if the crawl was interrupted.
    driver.quit()

# Store the final result to a JSON file.
save_json("fund_info_dict.json", fund_info_dict)

# Attachments published with the original tutorial:
#   fund_info_dict.json, fund_company_dict.json