1. Environment setup
conda create -n scraping python=3.10
conda activate scraping
pip install notebook
pip install bs4
pip install lxml
pip install scrapy
pip install pandas
pip install requests
pip install selenium
pip install webdriver-manager
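To confirm the environment is ready, a quick import check like the following can be run (a minimal sketch; it only covers the packages installed above):
# Sanity check: import the installed packages and print their versions if available.
import bs4, lxml, scrapy, pandas, requests, selenium
for mod in (bs4, lxml, scrapy, pandas, requests, selenium):
    print(mod.__name__, getattr(mod, '__version__', 'unknown'))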
2. Examples
-- Using Jupyter Notebook
# Running as root under WSL, so the --allow-root option is needed
jupyter notebook --allow-root
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen('http://www.pythonscraping.com/pages/page1.html')
bs = BeautifulSoup(html.read(), 'html.parser')
print(bs.h1)
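The same page can also be fetched with requests and parsed with the lxml parser installed above; a minimal alternative sketch:
import requests
from bs4 import BeautifulSoup

# Fetch the same example page and fail fast on HTTP errors.
resp = requests.get('http://www.pythonscraping.com/pages/page1.html', timeout=10)
resp.raise_for_status()
bs = BeautifulSoup(resp.text, 'lxml')  # use the faster lxml parser
print(bs.h1)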
-- Using VS Code
Using Scrapy
scrapy startproject wikiSpider
# Simple link extraction
Create the file wikiSpider/wikiSpider/spiders/article.py
from scrapy import Spider, Request

class ArticleSpider(Spider):
    name = 'article'

    def start_requests(self):
        urls = [
            'http://en.wikipedia.org/wiki/Python_%28programming_language%29',
            'https://en.wikipedia.org/wiki/Functional_programming',
            'https://en.wikipedia.org/wiki/Monty_Python',
        ]
        return [Request(url=url, callback=self.parse) for url in urls]

    def parse(self, response):
        url = response.url
        title = response.css('h1::text').extract_first()
        print(f"URL is: {url}")
        print(f'Title is: {title}')
Run
# Run from /root/py-workspace/scraping/wikiSpider/wikiSpider
scrapy runspider spiders/article.py
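Since article.py sits inside the generated project, the spider can also be started by its name from the project root (the directory that contains scrapy.cfg):
scrapy crawl article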
# Extract the content and the last-modified date
Create the file wikiSpider/wikiSpider/spiders/articles.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ArticleSpider(CrawlSpider):
    name = 'article'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        Rule(
            LinkExtractor(allow=r'.*'),
            callback='parse_items',
            follow=True
        )
    ]

    def parse_items(self, response):
        url = response.url
        title = response.css('span.mw-page-title-main::text').extract_first()
        text = response.xpath('//div[@id="mw-content-text"]//text()').extract()
        lastUpdated = response.css('li#footer-info-lastmod::text').extract_first()
        lastUpdated = lastUpdated.replace('This page was last edited on ', '')
        print(f'URL is: {url}')
        print(f'Title is: {title}')
        print(f'Text is: {text}')
        print(f'Last updated: {lastUpdated}')
# Run
scrapy runspider wikiSpider/spiders/articles.py
# Handle each page differently depending on whether it is an article
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ArticleSpider(CrawlSpider):
    name = 'article'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        Rule(
            LinkExtractor(allow='(/wiki/)((?!:).)*$'),
            callback='parse_items',
            cb_kwargs={'is_article': True}  ## passed on to the callback
        ),
        Rule(
            LinkExtractor(allow='.*'),
            callback='parse_items',  ## parse the page content
            cb_kwargs={'is_article': False}
        )
    ]

    def parse_items(self, response, is_article):
        print(response.url)
        title = response.css('span.mw-page-title-main::text').extract_first()
        if is_article:
            url = response.url
            text = response.xpath('//div[@id="mw-content-text"]//text()').extract()
            lastUpdated = response.css('li#footer-info-lastmod::text').extract_first()
            lastUpdated = lastUpdated.replace('This page was last edited on ', '')
            print(f'URL is: {url}')
            print(f'Title is: {title}')
            print(f'Text is: {text}')
            print(f'Last updated: {lastUpdated}')
        else:
            print(f'This is not an article: {title}')
Organizing the data into items
Edit wikiSpider/items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class Article(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    lastUpdated = scrapy.Field()
Create a new file, articleItems.py, that uses this class
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from wikiSpider.items import Article

class ArticleSpider(CrawlSpider):
    name = 'articleItems'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        Rule(
            LinkExtractor(allow='(/wiki/)((?!:).)*$'),
            callback='parse_items',
            follow=True,  # also follow links found on each page
        )
    ]

    def parse_items(self, response):
        article = Article()
        article['url'] = response.url
        article['title'] = response.css('h1::text').extract_first()
        article['text'] = response.xpath('//div[@id="mw-content-text"]//text()').extract()
        lastUpdated = response.css('li#footer-info-lastmod::text').extract_first()
        article['lastUpdated'] = lastUpdated.replace('This page was last edited on ', '')
        return article
Run
scrapy runspider wikiSpider/spiders/articleItems.py
# Write the extracted items to a file (the output format is chosen automatically from the extension)
scrapy runspider wikiSpider/spiders/articleItems.py -o articles.csv
scrapy runspider wikiSpider/spiders/articleItems.py -o articles.json
scrapy runspider wikiSpider/spiders/articleItems.py -o articles.xml
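Note that in recent Scrapy 2.x releases -o appends to an existing file while -O overwrites it; to start from a fresh file on every run, the overwrite flag can be used instead:
scrapy runspider wikiSpider/spiders/articleItems.py -O articles.json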
# Item pipelines
Uncomment the following three lines in settings.py (the number is the pipeline's priority; lower values run earlier when several pipelines are enabled)
ITEM_PIPELINES = {
    "wikiSpider.pipelines.WikispiderPipeline": 300,
}
Create spiders/articlePipelines.py (the spider only collects the data and does not modify it)
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from wikiSpider.items import Article

class ArticleSpider(CrawlSpider):
    name = 'articleItems'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        Rule(
            LinkExtractor(allow='(/wiki/)((?!:).)*$'),
            callback='parse_items',
            follow=True,  # also follow links found on each page
        )
    ]

    def parse_items(self, response):
        article = Article()
        article['url'] = response.url
        article['title'] = response.css('h1::text').extract_first()
        article['text'] = response.xpath('//div[@id="mw-content-text"]//text()').extract()
        article['lastUpdated'] = response.css('li#footer-info-lastmod::text').extract_first()
        return article
Edit pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from datetime import datetime
from wikiSpider.items import Article
from string import whitespace
class WikispiderPipeline(object):
    def process_item(self, article, spider):
        dateStr = article['lastUpdated']
        dateStr = dateStr.replace('This page was last edited on', '')
        dateStr = dateStr.strip()
        dateStr = datetime.strptime(dateStr, '%d %B %Y, at %H:%M')
        dateStr = dateStr.strftime('%Y-%m-%d %H:%M:%S')
        article['lastUpdated'] = dateStr

        texts = article['text'][0:50]
        texts = [line for line in texts if line not in whitespace]
        article['text'] = ''.join(texts)
        return article
# Setting the log level
Available log levels (most to least severe)
CRITICAL, ERROR, WARNING, INFO, DEBUG
# Add to settings.py
LOG_LEVEL = 'ERROR'
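The log level can also be overridden per run without touching settings.py, using Scrapy's -s option:
scrapy runspider wikiSpider/spiders/articleItems.py -s LOG_LEVEL=ERROR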
# Write the run-time log to a separate file
scrapy runspider wikiSpider/spiders/articleItems.py -o articles.json --logfile wi.log
# Scraping with Selenium
Install Chrome
# 1. Add the Google repository
cat <<EOF > /etc/yum.repos.d/google-chrome.repo
[google-chrome]
name=Google Chrome
baseurl=https://dl.google.com/linux/chrome/rpm/stable/x86_64
enabled=1
gpgcheck=1
gpgkey=https://dl.google.com/linux/linux_signing_key.pub
EOF
# 2. Install
dnf install -y google-chrome-stable
Check that it works
google-chrome --version
Basic run template
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # required on a server without a GUI
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.binary_location = '/usr/bin/google-chrome'
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)
driver.get("http://www.python.org")
time.sleep(2)
driver.close()
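Note: driver.close() only closes the current window, while driver.quit() ends the whole browser session and shuts down the driver process; for one-off scripts like these, driver.quit() can be used in its place.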
Fetching the rendered content of an Ajax page
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # required on a server without a GUI
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.binary_location = '/usr/bin/google-chrome'
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
time.sleep(3)
print(driver.find_element(By.ID, 'content').text)
driver.close()
# Scrape the updated content without sleep, by waiting for it to appear
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # required on a server without a GUI
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.binary_location = '/usr/bin/google-chrome'
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
try:
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.ID, 'loadedButton'))
    )
finally:
    print(driver.find_element(By.ID, 'content').text)
    driver.close()
# Redirect detection example
Check every 0.5 seconds; when the original element goes stale, treat it as a redirect
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # required on a server without a GUI
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.binary_location = '/usr/bin/google-chrome'
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)
def waitForLoad(driver):
    elem = driver.find_element(By.TAG_NAME, "html")
    for _ in range(0, 20):
        try:
            # Compare the saved <html> element with the current one; once the redirect
            # has replaced the page, the stale reference is expected to raise
            # StaleElementReferenceException.
            elem == driver.find_element(By.TAG_NAME, "html")
        except StaleElementReferenceException:
            return
        time.sleep(0.5)
    print("Timing out after 10 seconds and returning")
driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
waitForLoad(driver)
print(driver.page_source)
driver.close()
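The same redirect detection can also be done with Selenium's built-in staleness_of condition instead of the manual polling loop above; a minimal sketch under the same driver setup (the 10-second timeout is an arbitrary choice):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
old_html = driver.find_element(By.TAG_NAME, "html")
# Block until the original <html> element goes stale, i.e. the redirect replaced the page.
WebDriverWait(driver, 10).until(EC.staleness_of(old_html))
print(driver.page_source)
driver.close()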
# Detecting the change with a 15-second wait
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # required on a server without a GUI
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.binary_location = '/usr/bin/google-chrome'
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)
driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
try:
    txt = 'This is the page you are looking for!'
    bodyElement = WebDriverWait(driver, 15).until(
        EC.presence_of_all_elements_located((
            By.XPATH,
            f'//body[contains(text(), "{txt}")]'
        ))
    )
    print(bodyElement[0].text)
except TimeoutException:
    print('Did not find the element')
driver.close()
3. Example code on GitHub
GitHub - november11th/python-scraping: example code repository for 『파이썬으로 웹 크롤러 만들기(3판)』 (Hanbit Media, 2025)