requests.get(網址)¶

import requests

# Site root and the path of the movie board's index page.
base_url = "https://www.ptt.cc"
home_url = "/bbs/movie/index.html"

# Pass an explicit timeout: without one, requests waits indefinitely if the
# server stops responding.
response = requests.get(base_url + home_url, timeout=10)

# Bare expression: in a notebook this displays the Response repr.
response

<Response [200]>

# Keep the decoded body and the HTTP status around for the following cells,
# then show the status and the size of the page in characters.
html = response.text
status_code = response.status_code
print(status_code)
print(len(html))

200
17205

find html¶

# Cut the <title>...</title> element out of the raw HTML with plain string search.
closing_tag = '</title>'
title_pos1 = html.find('<title>')
title_pos2 = html.find(closing_tag, title_pos1)
title_end = title_pos2 + len(closing_tag)  # extend the slice over the closing tag
print(html[title_pos1:title_end])

<title>看板 movie 文章列表 - 批踢踢實業坊</title>

# Print the inner HTML of every <a>...</a> element using only str.find.
search_from = 0
while True:
    tag_start = html.find('<a', search_from)
    if tag_start == -1:
        break  # no more candidates; never feed -1 back into find() as a start index
    # '<a' is also the prefix of <abbr>, <article>, <aside>, ...; a real anchor
    # has whitespace or '>' immediately after the tag name.
    if html[tag_start + 2 : tag_start + 3] not in (' ', '\t', '\r', '\n', '>'):
        search_from = tag_start + 2
        continue
    a_pos1 = html.find('>', tag_start)  # end of the opening tag
    if a_pos1 == -1:
        break
    a_pos2 = html.find('</a>', a_pos1)  # start of the closing tag
    if a_pos2 == -1:
        break
    print(html[a_pos1+1 : a_pos2])
    # Resume the scan right after the opening tag.
    search_from = a_pos1

批踢踢實業坊
<span class="board-label">看板 </span>movie
關於我們
聯絡資訊
看板
精華區
最舊
&lsaquo; 上頁
下頁 &rsaquo;
最新
[普好雷]侏羅紀世界：殞落國度 
搜尋同標題文章
搜尋看板內 dive6 的文章
[請益] 分裂 某個醫生的橋段 有雷
搜尋同標題文章
搜尋看板內 NEDYA 的文章
Re: [吵架雷] 瞞天過海8面玲瓏
搜尋同標題文章
搜尋看板內 kj1020 的文章
[片單] 不落俗套的題材
搜尋同標題文章
搜尋看板內 kenshiloh 的文章
[好雷] 瞞天過海：八面玲瓏
搜尋同標題文章
搜尋看板內 b81314 的文章
[好雷] 看不見的台灣，最熟悉又最陌生的宮廟
搜尋同標題文章
搜尋看板內 kmtrash 的文章
[雷] 生靈勿進｜穿梭於生死兩界，無形與有形之間。
搜尋同標題文章
搜尋看板內 icemake121 的文章
[好微雷] 瞞天過海：八面玲瓏 
搜尋同標題文章
搜尋看板內 Nisio 的文章
[請益] 會特別去收集電影海報嗎?怎麼處理保存呢?
搜尋同標題文章
搜尋看板內 pipiboygay 的文章
[片單] 嚇人有創意的鬼片
搜尋同標題文章
搜尋看板內 darklunacist 的文章
[普雷] 侏羅紀世界：吐槽國度 
搜尋同標題文章
搜尋看板內 b81314 的文章
[普負雷] 《沒人愛小姐》，改變自己從改變想法開
搜尋同標題文章
搜尋看板內 a122239 的文章
[好雷] 侏儸紀世界 殞落國度 
搜尋同標題文章
搜尋看板內 fridaystory 的文章
[片單] 有沒有推崇專制反民主的電影？
搜尋同標題文章
搜尋看板內 ryanworld 的文章
[公告]《各式疑難雜症FAQ》
搜尋同標題文章
搜尋看板內 VOT1077 的文章
[公告] 板規！必看！｜好文推薦‧惡文檢舉
搜尋同標題文章
搜尋看板內 VOT1077 的文章
[公告] 勿再新貼(回文)谷阿莫起訴相關新聞 
搜尋同標題文章
搜尋看板內 cappa 的文章

requests 八卦板¶

# Request the Gossiping board index. Pass an explicit timeout so a stalled
# server cannot block the call forever.
resp = requests.get("https://www.ptt.cc/bbs/Gossiping/index.html", timeout=10)

# URL of the final response after redirects were followed; compare it with
# the URL that was requested.
resp.url

'https://www.ptt.cc/ask/over18?from=%2Fbbs%2FGossiping%2Findex.html'

# Status of the final response only; intermediate redirect responses are
# kept separately in resp.history.
resp.status_code

200

# Responses that were followed on the way to the final one (the redirect chain).
resp.history

[<Response [302]>]

美湯¶

from bs4 import BeautifulSoup

# Parse the movie-board HTML fetched earlier, using Python's built-in
# 'html.parser' backend.
soup = BeautifulSoup(markup=html, features='html.parser')

# Notebook display: shows the class of the parsed document object.
type(soup)

bs4.BeautifulSoup

# Look up the first <title> element in the parsed document.
title_tag = soup.find(name='title')

# Text content of the element, without the surrounding tag markup.
title_tag.get_text()

'看板 movie 文章列表 - 批踢踢實業坊'

# Gather every <a> element in document order.
a_tags = soup.find_all(name='a')

# Index 13 skips the ten navigation links and the first post's three links
# (title + two search links), landing on the second post's title link.
a_tags[13]

<a href="/bbs/movie/M.1529119928.A.BB2.html">[請益] 分裂 某個醫生的橋段 有雷</a>

# Notebook display: each item returned by find_all is an individual element object.
type(a_tags[0])

bs4.element.Tag

# Collect the link target of every anchor; anchors without an href attribute
# contribute '' (the default passed to get).
hrefs = [a_tag.get('href', '') for a_tag in a_tags]

# Peek at the first five targets.
hrefs[:5]

['/bbs/',
 '/bbs/movie/index.html',
 '/about.html',
 '/contact.html',
 '/bbs/movie/index.html']

# First <div class="r-ent"> in the page, i.e. the first entry of the article list.
# class_ is Beautiful Soup's keyword spelling for filtering on the class attribute.
post_tag = soup.find('div', class_='r-ent')

# Notebook display of the whole entry.
post_tag

<div class="r-ent">
<div class="nrec"><span class="hl f2">3</span></div>
<div class="title">
<a href="/bbs/movie/M.1529118488.A.D23.html">[普好雷]侏羅紀世界：殞落國度 </a>
</div>
<div class="meta">
<div class="author">dive6</div>
<div class="article-menu">
<div class="trigger">⋯</div>
<div class="dropdown">
<div class="item"><a href="/bbs/movie/search?q=thread%3A%5B%E6%99%AE%E5%A5%BD%E9%9B%B7%5D%E4%BE%8F%E7%BE%85%E7%B4%80%E4%B8%96%E7%95%8C%EF%BC%9A%E6%AE%9E%E8%90%BD%E5%9C%8B%E5%BA%A6+">搜尋同標題文章</a></div>
<div class="item"><a href="/bbs/movie/search?q=author%3Adive6">搜尋看板內 dive6 的文章</a></div>
</div>
</div>
<div class="date"> 6/16</div>
<div class="mark"></div>
</div>
</div>

# Search only inside this entry: returns its first <a> descendant.
post_tag.find('a')

<a href="/bbs/movie/M.1529118488.A.D23.html">[普好雷]侏羅紀世界：殞落國度 </a>

# Search only inside this entry: returns its first <span> descendant.
post_tag.find('span')

<span class="hl f2">3</span>

Cookie¶

# Fetch the Gossiping board index without sending any cookie. The explicit
# timeout keeps the call from hanging if the server stalls.
resp = requests.get('https://www.ptt.cc/bbs/Gossiping/index.html', timeout=10)

# URL of the final response after redirects were followed.
resp.url

'https://www.ptt.cc/ask/over18?from=%2Fbbs%2FGossiping%2Findex.html'

# Redirect responses that were followed before the final response was returned.
resp.history

[<Response [302]>]

requests.post(url, data)¶

# Submit the age-confirmation form: 'yes' confirms, 'from' names the page to
# return to. The payload is sent form-encoded via data=, and an explicit
# timeout keeps the call from hanging if the server stalls.
resp = requests.post(
    'https://www.ptt.cc/ask/over18',
    data={
        'yes' : 'yes',
        'from' : '/bbs/Gossiping/index.html',
    },
    timeout=10,
)

# URL of the final response after redirects were followed.
resp.url

'https://www.ptt.cc/bbs/Gossiping/index.html'

# Status of the final response after the form submission.
resp.status_code

200

cookie 不會保存¶

# A fresh module-level requests.get call does not reuse cookies from earlier
# calls, so print the final URL to see where this request ends up.
# The explicit timeout keeps the call from hanging if the server stalls.
resp = requests.get('https://www.ptt.cc/bbs/Gossiping/index.html', timeout=10)
print(resp.url)

https://www.ptt.cc/ask/over18?from=%2Fbbs%2FGossiping%2Findex.html

自己保存 cookie¶

# Submit the age-confirmation form again, this time keeping the cookie it yields.
# Explicit timeouts keep both calls from hanging if the server stalls.
resp = requests.post(
    'https://www.ptt.cc/ask/over18',
    data={
        'yes' : 'yes',
        'from' : '/bbs/Gossiping/index.html',
    },
    timeout=10,
)

# First response in the redirect chain, i.e. the direct answer to the POST.
# NOTE(review): raises IndexError if the POST is ever answered without a redirect.
prev_resp = resp.history[0]

# Cookies attached to that response (presumably where the age-check cookie
# is set - confirm against the server's Set-Cookie header).
cookie = prev_resp.cookies

# Replay the cookies by hand on a new, otherwise unrelated request.
resp = requests.get('https://www.ptt.cc/bbs/Gossiping/index.html',
                    cookies=cookie,
                    timeout=10)

print(resp.url)
print(resp.status_code)

https://www.ptt.cc/bbs/Gossiping/index.html
200

自己做一個 cookie¶

# Skip the form entirely and send a hand-written over18 cookie with the request.
# The explicit timeout keeps the call from hanging if the server stalls.
resp = requests.get('https://www.ptt.cc/bbs/Gossiping/index.html',
                    cookies={'over18':'1'},
                    timeout=10)

print(resp.url)
print(resp.status_code)

https://www.ptt.cc/bbs/Gossiping/index.html