import requests
# Fetch the index page of the PTT "movie" board and report the HTTP status
# code and the size of the returned document.
base_url = "https://www.ptt.cc"
home_url = "/bbs/movie/index.html"
# Pass a timeout explicitly: requests waits indefinitely by default, so an
# unresponsive server would otherwise hang the script.
response = requests.get(base_url + home_url, timeout=10)
response  # bare expression: echoed by an interactive session, ignored by a script
status_code = response.status_code
html = response.text  # decoded response body, reused by the parsing steps below
print(status_code)
print(len(html))
# Locate the <title> element with plain substring search (no HTML parser)
# and print it together with its opening and closing tags.
open_tag, close_tag = '<title>', '</title>'
title_pos1 = html.find(open_tag)
title_pos2 = html.find(close_tag, title_pos1)
# Extend the slice past the closing tag so the printed text includes it.
title_end = title_pos2 + len(close_tag)
print(html[title_pos1:title_end])
# Print the text found between every <a ...> opening tag and the next
# </a>, using plain substring search rather than an HTML parser.
a_pos2 = -1
a_pos1 = html.find('<a')
while a_pos1 != -1:
    # '<a' is also the prefix of other tag names (<abbr>, <article>, ...);
    # treat the match as an anchor only when the tag name ends right after
    # the 'a', i.e. the next character is '>' or whitespace.
    next_char = html[a_pos1 + 2:a_pos1 + 3]
    if next_char == '>' or next_char.isspace():
        # Stop explicitly when a search fails instead of handing the -1
        # sentinel to the next find(), where a negative start offset is
        # counted from the end of the string.
        a_pos1 = html.find('>', a_pos1)
        if a_pos1 == -1:
            break
        a_pos2 = html.find('</a>', a_pos1)
        if a_pos2 == -1:
            break
        print(html[a_pos1 + 1:a_pos2])
    # Resume one character past the current position so the same match is
    # never found twice.
    a_pos1 = html.find('<a', a_pos1 + 1)
# Request the Gossiping board index without sending any cookies, keeping the
# response so its final URL, status code and redirect chain can be inspected.
# The explicit timeout keeps an unresponsive server from hanging the script
# (requests waits indefinitely by default).
resp = requests.get("https://www.ptt.cc/bbs/Gossiping/index.html", timeout=10)
# Bare expressions: an interactive session echoes them, a script ignores them.
resp.url
resp.status_code
resp.history
from bs4 import BeautifulSoup
# Parse the downloaded page with the standard-library backed 'html.parser'
# and look up its <title> element.
soup = BeautifulSoup(html, features='html.parser')
type(soup)  # bare expression: only an interactive session displays it
title_tag = soup.find(name='title')
title_tag.text
# Gather every <a> element of the parsed page, then the href attribute of
# each one (an empty string when the attribute is absent).
a_tags = soup.find_all('a')
# Bare expressions kept for interactive inspection of a sample element.
a_tags[13]
type(a_tags[0])
hrefs = [anchor.get('href', '') for anchor in a_tags]
hrefs[:5]
# Take the first <div class="r-ent"> of the page (presumably one post entry
# of the board listing -- confirm against the page markup) and look at the
# first <a> and first <span> inside it.
post_tag = soup.find('div', class_='r-ent')
post_tag  # bare expressions: only an interactive session displays them
post_tag.find('a')
post_tag.find('span')
# Request the Gossiping board index again and inspect where the request
# ended up. Every call passes a timeout because requests waits indefinitely
# by default.
resp = requests.get('https://www.ptt.cc/bbs/Gossiping/index.html', timeout=10)
# Bare expressions: an interactive session echoes them, a script ignores them.
resp.url
resp.history
# Submit the form fields of the /ask/over18 endpoint, naming the board index
# as the page to come back to.
resp = requests.post(
    'https://www.ptt.cc/ask/over18',
    data={
        'yes': 'yes',
        'from': '/bbs/Gossiping/index.html',
    },
    timeout=10,
)
resp.url
resp.status_code
# Show the URL reached without cookies, answer the /ask/over18 form, then
# repeat the board request with the cookies obtained from that answer.
# Every call passes a timeout because requests waits indefinitely by default.
resp = requests.get('https://www.ptt.cc/bbs/Gossiping/index.html', timeout=10)
print(resp.url)
resp = requests.post(
    'https://www.ptt.cc/ask/over18',
    data={
        'yes': 'yes',
        'from': '/bbs/Gossiping/index.html',
    },
    timeout=10,
)
# resp.history lists the intermediate responses that were followed as
# redirects. When the POST was not redirected the list is empty, so fall back
# to the POST response itself instead of raising IndexError.
prev_resp = resp.history[0] if resp.history else resp
cookie = prev_resp.cookies
resp = requests.get(
    'https://www.ptt.cc/bbs/Gossiping/index.html',
    cookies=cookie,
    timeout=10,
)
print(resp.url)
print(resp.status_code)
# Request the board index while sending a hand-written over18 cookie, then
# print the final URL and status code. The explicit timeout keeps an
# unresponsive server from hanging the script (requests waits indefinitely
# by default).
resp = requests.get(
    'https://www.ptt.cc/bbs/Gossiping/index.html',
    cookies={'over18': '1'},
    timeout=10,
)
print(resp.url)
print(resp.status_code)