API 串接：公開資訊觀測站 - 董監事持股明細 - 程式碼 - 【教材專區】Python網路爬蟲工作坊｜金融應用篇 - Cupoy

爬蟲目標：公開資訊觀測站 - 董監事持股明細。我們希望能取得每間上市櫃公司每月的董監事名單和持股數，以及人物跨公司的關係和持股數的變化，用來做股價變化分析或投資標的選擇的運用。頁面連結：https://mops.twse.com.tw/mops/web/stapap1

# Crawler target: Market Observation Post System (MOPS) - directors' and
# supervisors' shareholding details.
#
# Goal: collect, for every listed/OTC company and every month, the list of
# directors and supervisors together with their shareholdings, so that
# cross-company relationships and shareholding changes can be used for
# price-movement analysis or for picking investment targets.
#
# Page: https://mops.twse.com.tw/mops/web/stapap1
# API:  https://mops.twse.com.tw/mops/web/ajax_stapap1
# Data unit: one table of names and shareholdings per company per month.

# --- Load packages ---
from bs4 import BeautifulSoup  # HTML parsing
import datetime as dt
from io import StringIO  # wrap literal HTML before handing it to pd.read_html
import json
import numpy as np
import pandas as pd
import requests  # sending HTTP requests
from time import sleep  # pausing between requests
from tqdm import tqdm  # progress bar
import warnings

warnings.filterwarnings("ignore")

# --- Settings ---
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Mobile Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded",
}
url = 'https://mops.twse.com.tw/mops/web/ajax_stapap1'
stock_id = 1102
year = 110
month = 11
# Form fields posted to the API; co_id / year / month select the report.
payload = {
    'encodeURIComponent': '1',
    'step': '1',
    'firstin': '1',
    'off': '1',
    'keyword4': '',
    'code1': '',
    'TYPEK2': '',
    'checkbtn': '',
    'queryName': 'co_id',
    'inpuType': 'co_id',
    'TYPEK': 'all',
    'isnew': 'false',
    'co_id': str(stock_id),
    'year': str(year),
    'month': str(month).zfill(2),
}

# --- Send a POST request to test fetching the data ---
# Send the request; the timeout keeps a stalled connection from hanging forever.
res = requests.post(url, data=payload, headers=headers, timeout=30).content
# Parse the page
soup = BeautifulSoup(res, 'html.parser')
# Extract the data we need: one list of stripped cell texts per table row
shareholdings_records = [
    [i.text.strip() for i in item.find_all('td')]
    for item in soup.find('table', {'class': 'hasBorder'}).find_all('tr')
]
print(shareholdings_records)

# --- Approach 1: fix up the header rows in the list, then build a DataFrame ---
# Header handling: the last cell of the first row is a group header
# (holdings of spouse, minor children and holdings under others' names);
# the second row holds its sub-columns, so prefix each with the group name.
pre = shareholdings_records[0][-1]
total_cols = shareholdings_records[0][:-1] + [f"{pre}-{col}" for col in shareholdings_records[1]]
shareholdings_records[0] = total_cols
shareholdings_records.pop(1)
# Convert the data to a DataFrame
cols = shareholdings_records[0]
data = shareholdings_records[1:]
df = pd.DataFrame(data, columns=cols)
# Drop repeated header rows and rows without a current-shareholding value
df = df[(df['職稱']!="職稱") & (df['目前持股'].notnull())].reset_index(drop=True)
df  # notebook display; a no-op when run as a script

# --- Approach 2: convert with pd.read_html, then clean up ---
table = soup.find('table', {'class': 'hasBorder'})
# Passing literal HTML to read_html is deprecated; wrap it in StringIO.
df = pd.read_html(StringIO(str(table)))[0]
# (Approach 2, continued) The last three columns have a two-row header:
# join row 0 and row 1 with "-" to build their final names.
merge_headers = (df.iloc[0, -3:] + "-" + df.iloc[1, -3:]).values.tolist()
df.columns = df.iloc[0, :-3].values.tolist() + merge_headers
df = df.iloc[2:, :].reset_index(drop=True)
# Drop header rows that are repeated inside the table body
df = df[df['職稱']!="職稱"].reset_index(drop=True)
df  # notebook display; a no-op when run as a script


# --- Wrap the crawler in a function so it is easy to call ---
def getShareHoldings(stock_id, year, month):
    """Fetch one company's director/supervisor shareholding table for one month.

    Args:
        stock_id: Company code (e.g. 2330).
        year: Year as expected by the site (e.g. 110; the exercises below
            map 109-110 to 2020-2021).
        month: Month number, 1-12.

    Returns:
        A list of rows (lists of stripped cell strings). Row 0 is the merged
        header: the group header in its last cell is expanded into one
        "<group>-<sub-column>" name per cell of the original second row.

    Raises:
        ValueError: If the response has no result table or the table has
            fewer than two header rows.
    """
    # Fill a copy so the shared module-level template is never mutated, and
    # format the values the same way the template does (strings, two-digit
    # month).
    form = dict(payload)
    form['co_id'] = str(stock_id)
    form['year'] = str(year)
    form['month'] = str(month).zfill(2)

    # Send the request; the timeout keeps a stalled connection from hanging.
    res = requests.post(url, data=form, headers=headers, timeout=30).content
    # Parse the page
    soup = BeautifulSoup(res, 'html.parser')
    table = soup.find('table', {'class': 'hasBorder'})
    if table is None:
        raise ValueError(
            f"No shareholding table found for {stock_id} {year}-{form['month']}"
        )

    # Collect the cell texts row by row
    shareholdings_records = [
        [cell.text.strip() for cell in row.find_all('td')]
        for row in table.find_all('tr')
    ]
    if len(shareholdings_records) < 2 or not shareholdings_records[0]:
        raise ValueError(
            f"Unexpected table layout for {stock_id} {year}-{form['month']}"
        )

    # Header handling: merge the two header rows into one
    pre = shareholdings_records[0][-1]
    total_cols = shareholdings_records[0][:-1] + [f"{pre}-{col}" for col in shareholdings_records[1]]
    shareholdings_records[0] = total_cols
    shareholdings_records.pop(1)
    return shareholdings_records


# Fetch the data for 2330, year 110, month 1
shareholdings_records = getShareHoldings(
    stock_id=2330, year=110, month=1
)
df = pd.DataFrame(shareholdings_records[1:], columns=shareholdings_records[0])
df.head()

# --- Load the list of listed Taiwan stock company codes: listed_company.csv ---
# NOTE: the delimiter is a full-width (ideographic, U+3000) space.
stock_ids = pd.read_csv('https://raw.githubusercontent.com/A-baoYang/Crawlers/jupyter_gcp_cathayddt/Financial/stock/listed_company.csv', delimiter="　")
stock_ids.columns = ["id", "name"]
stock_id_list = stock_ids["id"].unique().tolist()
print(stock_ids.shape)
stock_ids  # notebook display

# --- Exercise 1 ---
# Fetch the shareholding details of the company "雄獅" for year 109, month 5,
# and save them as .csv.
stock_id = 2731
year = 109
month = 5
# The call must actually run here; otherwise the records fetched earlier for
# another company would be written under this company's filename.
shareholdings_records = getShareHoldings(
    stock_id=stock_id, year=year, month=month
)
# In a notebook, shareholdings_records[:5] previews the first rows.
df = pd.DataFrame(shareholdings_records[1:], columns=shareholdings_records[0])
df.to_csv(f"{stock_id}-sharehold-{year}-{month}.csv", index=False)

# --- Exercise 2 ---
# Fetch the shareholding details of the company "玉山金" for year 108,
# months 7-12, and save them as .csv.
stock_ids[stock_ids["name"].str.contains("玉山金")]  # look up the company code

# Small break/continue demo kept from the original material.
# NOTE(review): the original layout was lost; the print is assumed to be
# inside the loop body - confirm.
for number in range(10):
    if number == 5:
        break  # continue here
    print('Number is ' + str(number))

stock_id = 2884
year = 108
broken_records = []  # (stock_id, year, month) tuples that failed
for month in range(7, 13):
    try:
        print(f"Crawling {stock_id} {year} {month}")
        sleep(3)  # pause between requests
        shareholdings_records = getShareHoldings(
            stock_id=stock_id, year=year, month=month
        )
        df = pd.DataFrame(shareholdings_records[1:], columns=shareholdings_records[0])
        df.to_csv(f"{stock_id}-{year}{str(month).zfill(2)}.csv", index=False)
    except Exception as err:
        # Best effort: report, remember the failure, and keep crawling.
        print(f"At {stock_id} {year} {month}")
        print(err)
        broken_records.append((stock_id, year, month))

# --- Exercise 3 ---
# Demonstrate on part of the company list: crawl the shareholding data and
# save it as .csv. Take the first 10 companies of the list and fetch every
# month of years 109-110 (2020-2021).
stock_id_list[:10]  # notebook display

for stock_id in tqdm(stock_id_list[:10]):
    for year in range(109, 111):
        for month in range(1, 13):
            try:
                print(f"Crawling {stock_id} {year} {month}")
                sleep(3)  # pause between requests
                shareholdings_records = getShareHoldings(
                    stock_id=stock_id, year=year, month=month
                )
                df = pd.DataFrame(shareholdings_records[1:], columns=shareholdings_records[0])
                df.to_csv(f"{stock_id}-{year}{str(month).zfill(2)}.csv", index=False)
            except Exception as err:
                # Same best-effort handling as exercise 2, so failed
                # periods can be retried later.
                print(f"At {stock_id} {year} {month}")
                print(err)
                broken_records.append((stock_id, year, month))