2015年7月24日 星期五

python爬蟲



最近在練習python爬蟲, 以下是對岸用正規表示法寫的
http://cuiqingcai.com/1001.html
Python爬虫实战四之抓取淘宝MM照片-->我不是為了標題才改code的哦!
然後又看到大數學堂使用BeautifulSoup去爬ptt,那我來改寫對岸寫的東西

import requests
from bs4 import BeautifulSoup
res = requests.get('http://mm.taobao.com/json/request_top_list.htm',verify=False)
soup = BeautifulSoup(res.text)
for entry in soup.select('.list-item'):
    print entry.select('.lady-name')[0].text,entry.select('strong')[0].text+,entry.select('span')[0].text

結果如下圖


太簡單啦~正規表示法我還認真的看了半天才知道在做什麼
BeautifulSoup真是好東西

用re正規表示法也可以,只是麻煩了點
import urllib
import urllib2
import re

url = 'http://mm.taobao.com/json/request_top_list.htm'
try:
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    content = response.read().decode('gbk')
    pattern = re.compile('<div class="list-item".*?<a class="lady-name.*?>(.*?)</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>',re.S)
    items = re.findall(pattern,content)
    for item in items:
        print item[0],item[1],item[2]
     
except urllib2.URLError, e:
    if hasattr(e,"code"):
        print e.code
    if hasattr(e,"reason"):
        print e.reason


結果如下圖

     
如同以前自控老師說,假設成功殊途同歸!!!

2015年7月23日 星期四

抓三大法人的網頁資料



抓三大法人統計表,出處來自大數學堂http://course.largitdata.com/course/31/
只能說david是神,有機會看到要拜

第一步是抓三大法人的網頁內容
# Step 1: fetch the TWSE institutional-investors page (Big5 encoded)
# and dump the raw HTML.
import requests

target = 'http://www.twse.com.tw/ch/trading/fund/BFI82U/BFI82U.php?report1=day&input_date=104%2F07%2F22&mSubmit=%ACd%B8%DF&yr=2015&w_date=20150720&m_date=20150701'
res = requests.get(target)
res.encoding = 'big5'   # declare the page encoding before reading .text
print(res.text)

beautifulsoup分析
# Step 2: parse the same page with BeautifulSoup and pull out the
# table that carries the trading figures (CSS class "board_trad").
import requests
from bs4 import BeautifulSoup

page = requests.get('http://www.twse.com.tw/ch/trading/fund/BFI82U/BFI82U.php?report1=day&input_date=104%2F07%2F22&mSubmit=%ACd%B8%DF&yr=2015&w_date=20150720&m_date=20150701')
page.encoding = 'big5'
print(BeautifulSoup(page.text).select('.board_trad'))

只抓tr裡面的td
# Step 3: walk every data row and print each cell on its own line.
import requests
from bs4 import BeautifulSoup

res = requests.get('http://www.twse.com.tw/ch/trading/fund/BFI82U/BFI82U.php?report1=day&input_date=104%2F07%2F22&mSubmit=%ACd%B8%DF&yr=2015&w_date=20150720&m_date=20150701')
res.encoding = 'big5'
soup = BeautifulSoup(res.text)
# The first two <tr> rows are table headers, so skip them.
for row in soup.select('.board_trad tr')[2:]:
    for cell in row.select('td'):
        print(cell.text)
最後的整理
# Final cleanup: print the item name plus the three money columns
# (total buy / total sell / difference) for every data row.
import requests
from bs4 import BeautifulSoup

res = requests.get('http://www.twse.com.tw/ch/trading/fund/BFI82U/BFI82U.php?report1=day&input_date=104%2F07%2F22&mSubmit=%ACd%B8%DF&yr=2015&w_date=20150720&m_date=20150701')
res.encoding = 'big5'
soup = BeautifulSoup(res.text)
for tr in soup.select('.board_trad tr')[2:]:
    cells = tr.select('td')
    # Joined with single spaces -- same output as "print a, b, c, d".
    print(u' '.join([cells[0].text, cells[1].text, cells[2].text, cells[3].text]))

用str()把時間整理成字串再切割
# Walk back through the previous nine days and show each date
# split into its ['YYYY', 'MM', 'DD'] parts.
from datetime import date, timedelta

today = date.today()
for _ in range(9):
    today -= timedelta(days=1)
    print(str(today).split('-'))

設定多組的時間,'-'.join代表插入'-'且只顯示括號內的值
# Same loop, but reformat each date in ROC style: the year minus 1911,
# re-joined with '-' ("'-'.join" glues the three parts back together).
from datetime import date, timedelta

today = date.today()
for _ in range(9):
    today -= timedelta(days=1)
    y, m, d = str(today).split('-')
    print('-'.join([str(int(y) - 1911), m, d]))

抓取多天的數據
import requests
from bs4 import BeautifulSoup
from datetime import date,timedelta
#res = requests.get('http://www.twse.com.tw/ch/trading/fund/BFI82U/BFI82U.php?report1=day&input_date=104%2F07%2F22&mSubmit=%ACd%B8%DF&yr=2015&w_date=20150720&m_date=20150701')
# URL template: {0} is filled with the ROC-format date (e.g. 104%2F07%2F22).
url = 'http://www.twse.com.tw/ch/trading/fund/BFI82U/BFI82U.php?report1=day&input_date={0}&mSubmit=%ACd%B8%DF&yr=2015&w_date=20150720&m_date=20150701'
def getTradeValue(dt):
    """Fetch the TWSE report for ROC date ``dt`` and print its data rows."""
    res = requests.get(url.format(dt))
    res.encoding = 'big5'
    doc = BeautifulSoup(res.text)
    # Skip the two header rows, then print the first four cells of each row.
    for row in doc.select('.board_trad tr')[2:]:
        cells = row.select('td')
        print(u' '.join([cells[0].text, cells[1].text, cells[2].text, cells[3].text]))
       

# Query each of the previous nine days.
today = date.today()
for _ in range(9):
    today -= timedelta(days=1)
    y, m, d = str(today).split('-')
    # %2F is the URL-encoded "/" the site expects between the date parts.
    getTradeValue('%2F'.join([str(int(y) - 1911), m, d]))

加上日期的資訊
import requests
from bs4 import BeautifulSoup
from datetime import date,timedelta
#res = requests.get('http://www.twse.com.tw/ch/trading/fund/BFI82U/BFI82U.php?report1=day&input_date=104%2F07%2F22&mSubmit=%ACd%B8%DF&yr=2015&w_date=20150720&m_date=20150701')
# URL template for one day's report; {0} takes the ROC-format date.
url = 'http://www.twse.com.tw/ch/trading/fund/BFI82U/BFI82U.php?report1=day&input_date={0}&mSubmit=%ACd%B8%DF&yr=2015&w_date=20150720&m_date=20150701'
def money_conversion(input_ele):
    """Turn a comma-grouped number string such as '1,234' into the int 1234."""
    return int(input_ele.replace(',', ''))

def getTradeValue(today):
    dayary = str(today).split('-')
    dt = '%2F'.join([str(int(dayary[0]) - 1911), dayary[1],dayary[2]])
    res = requests.get(url.format(dt))
    res.encoding = 'big5'
    soup = BeautifulSoup (res.text)
    for tr in soup.select('.board_trad tr')[2:]:
        td = tr.select('td')
        print td[0].text, money_conversion(td[1].text),money_conversion(td[2].text),money_conversion(td[3].text),today
       

# Fetch the previous nine days, one request per day.
today = date.today()
for _ in range(9):
    today -= timedelta(days=1)
    getTradeValue(today)

sqlite存到檔案,檔案要寫絕對路徑
# Read the first row stored in InvestorTradingValue.
# NOTE: the sqlite file path should be absolute (hence c:/...).
import sqlite3 as lite

con = lite.connect('c:/finace.sqlite')
try:
    cur = con.cursor()
    cur.execute("select * from InvestorTradingValue")
    ret = cur.fetchone()
    print(ret)
    # Example insert, kept for reference:
    #cur.execute("insert into InvestorTradingValue(item,total_buy,total_sell,difference,date) values('foreign',20,30,-10,'2013-05-05')")
    #con.commit()
finally:
    # FIX: close the connection even when the query raises; the original
    # leaked the handle on error.
    con.close()

存檔資料及取檔資料(mark處是取檔資料)
# Read one row back; the commented lines (the "mark" mentioned above)
# show how a row would be inserted.
import sqlite3 as lite

con = lite.connect('c:/finace.sqlite')
try:
    cur = con.cursor()
    cur.execute("select * from InvestorTradingValue")
    ret = cur.fetchone()
    print(ret)
    #cur.execute("insert into InvestorTradingValue(item,total_buy,total_sell,difference,date) values('foreign',20,30,-10,'2013-05-05')")
    #con.commit()
finally:
    # FIX: guarantee the connection is released even if the query fails;
    # the original skipped con.close() on an exception.
    con.close()

存到sqlite
import requests
from bs4 import BeautifulSoup
from datetime import date,timedelta
import sqlite3 as lite

# FIX: removed a stray "con.close()" (plus leftover commented paste) that ran
# here before any connection existed -- it raised NameError on import.

# URL template ({0} = ROC-format date, e.g. 104%2F07%2F22).
url = 'http://www.twse.com.tw/ch/trading/fund/BFI82U/BFI82U.php?report1=day&input_date={0}&mSubmit=%ACd%B8%DF&yr=2015&w_date=20150720&m_date=20150701'
# Parameterized insert -- "?" placeholders let sqlite3 handle quoting safely.
# FIX: column "totol_sell" corrected to "total_sell" to match the insert
# statement shown earlier in this post -- verify against the real table schema.
sql = "insert into InvestorTradingValue(item,total_buy,total_sell,difference,date) values(?,?,?,?,?)"
def money_conversion(input_ele):
    """Strip the thousands separators from a number string and return an int."""
    parts = input_ele.split(',')
    return int(''.join(parts))

def getTradeValue(cur, today):
    """Scrape one day's figures and insert every data row through ``cur``."""
    # ISO date -> ROC date with the URL-encoded "/" separator.
    y, m, d = str(today).split('-')
    roc_date = '%2F'.join([str(int(y) - 1911), m, d])
    res = requests.get(url.format(roc_date))
    res.encoding = 'big5'
    doc = BeautifulSoup(res.text)
    for row in doc.select('.board_trad tr')[2:]:
        cells = row.select('td')
        record = [cells[0].text,
                  money_conversion(cells[1].text),
                  money_conversion(cells[2].text),
                  money_conversion(cells[3].text),
                  today]
        cur.execute(sql, record)
# Open the database, scrape the previous nine days, then commit once.
con = lite.connect('c:/finace.sqlite')
cur = con.cursor()

today = date.today()
for _ in range(9):
    today -= timedelta(days=1)
    getTradeValue(cur, today)

con.commit()
con.close()

結果如下

FB設定搶先看的方式

設定搶先看的兩種方式 A1. 先到我家的日常粉絲團按下 …( 紅框處 ) A2. 按下追蹤中 ( 紅框處 ) A3. 按下搶先看 ( 紅框處 ) A4. 完成!!! 另一種方式 ...