Dev metacog/DevCaJournal
web crawl 개발일지
잘 배우고, 잘 익히기
2021. 9. 15. 23:52
ㅇ 핵심 기능 : openpyxl, append, 웹브라우저의 검사 기능과 엘리먼트 파악
- thumbnail = article.select_one('a > img')['src']
"""Crawl Naver news search results and save them to an Excel workbook.

Core pieces: Selenium renders the search page, BeautifulSoup parses the
HTML, and openpyxl appends one row per article (title, link, press,
thumbnail) before saving to disk.
"""
from bs4 import BeautifulSoup
from openpyxl import Workbook
from selenium import webdriver


def crawl_articles(query: str = "추석", filename: str = "articles.xlsx") -> None:
    """Fetch the Naver news search page for *query* and write one Excel
    row per article to *filename*.

    Parameters
    ----------
    query : str
        Search keyword inserted into the Naver news search URL.
    filename : str
        Path of the .xlsx file to create.
    """
    wb = Workbook()
    ws1 = wb.active
    ws1.title = "articles"
    # Header row (Korean labels): title, link, press, thumbnail.
    ws1.append(["제목", "링크", "신문사", "썸네일"])

    # NOTE(review): positional driver path is deprecated in Selenium 4
    # (use Service('chromedriver')); kept as-is for Selenium 3 compatibility.
    driver = webdriver.Chrome('chromedriver')
    try:
        driver.get(f"https://search.naver.com/search.naver?&where=news&query={query}")
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always release the browser process, even if page load/parsing fails.
        driver.quit()

    # Each <li> under the news group is one search result entry.
    articles = soup.select(
        '#main_pack > section.sc_new.sp_nnews._prs_nws > div > div.group_news > ul > li'
    )
    for article in articles:
        link_tag = article.select_one('li > div > div > a')
        if link_tag is None:
            # Skip list items that are not articles (ads, separators).
            continue
        title = link_tag.text
        link = link_tag['href']

        press_tag = article.select_one(
            'div.news_wrap.api_ani_send > div > div.news_info > div.info_group > a.info.press'
        )
        # The press name sometimes carries a trailing "언론사" label; strip it.
        press = press_tag.text.split(' ')[0].replace('언론사', '') if press_tag else ''

        img_tag = article.select_one('a > img')
        # Thumbnails are optional; fall back to '' instead of crashing on
        # articles that have no image (the original [src] lookup raised here).
        thumbnail = img_tag.get('src', '') if img_tag is not None else ''

        ws1.append([title, link, press, thumbnail])

    wb.save(filename=filename)


if __name__ == "__main__":
    crawl_articles()