
Building a Web Crawler in Python

by 가능성1g 2025. 3. 28.

 

1. Environment setup

conda create -n scraping python=3.10

conda activate scraping

pip install notebook

pip install bs4

pip install lxml

pip install scrapy

pip install pandas

pip install requests

pip install selenium

pip install webdriver-manager
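
Optionally, a quick sanity check that the packages installed cleanly; this small sketch (not part of the original notes) just imports each one and prints its version:

import bs4, lxml, scrapy, pandas, requests, selenium

# print each package's version; fall back to 'unknown' if the attribute is missing
for mod in (bs4, lxml, scrapy, pandas, requests, selenium):
    print(mod.__name__, getattr(mod, '__version__', 'unknown'))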

 

2. Examples

-- Using Jupyter Notebook

# running as root under WSL, so the --allow-root option is added

jupyter notebook --allow-root

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com/pages/page1.html')
bs = BeautifulSoup(html.read(), 'html.parser')
print(bs.h1)
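
The same page can also be fetched with requests (installed above) instead of urllib; this is just an alternative sketch of the example, not part of the original:

import requests
from bs4 import BeautifulSoup

resp = requests.get('http://www.pythonscraping.com/pages/page1.html')
resp.raise_for_status()  # raise an exception on HTTP errors instead of parsing an error page
bs = BeautifulSoup(resp.text, 'html.parser')
print(bs.h1)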

 

-- Using VS Code

Using Scrapy

scrapy startproject wikiSpider

 

# Simple example: crawl a fixed list of links and print each page's title

Create the file wikiSpider/wikiSpider/spiders/article.py

from scrapy import Spider, Request
class ArticleSpider(Spider):
    name = 'article'

    def start_requests(self):
        urls = [
            'http://en.wikipedia.org/wiki/Python_%28programming_language%29',
            'https://en.wikipedia.org/wiki/Functional_programming',
            'https://en.wikipedia.org/wiki/Monty_Python',            
        ]
        return [Request(url=url, callback=self.parse) for url in urls]
    
    def parse(self, response):
        url = response.url
        title = response.css('h1::text').extract_first()
        print(f"URL is: {url}")
        print(f'Title is: {title}')

 

Run

# run from /root/py-workspace/scraping/wikiSpider/wikiSpider

scrapy runspider spiders/article.py

 

# Extract page content and the last-modified date

Create wikiSpider/wikiSpider/spiders/articles.py

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ArticleSpider(CrawlSpider):
    name = 'article'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        Rule(
            LinkExtractor(allow=r'.*'),
            callback='parse_items',
            follow=True
        )
    ]
    
    def parse_items(self, response):
        url = response.url
        title = response.css('span.mw-page-title-main::text').extract_first()
        text = response.xpath('//div[@id="mw-content-text"]//text()').extract()
        lastUpdated = response.css(
            'li#footer-info-lastmod::text'
        ).extract_first()
        lastUpdated = lastUpdated.replace('This page was last edited on ','')
        print(f'URL is: {url}')
        print(f'Title is: {title}')
        print(f'Text is: {text}')
        print(f'Last updated: {lastUpdated}')

# Run (this spider follows every link it encounters, so stop it with Ctrl+C once you have seen enough output)

scrapy runspider wikiSpider/spiders/articles.py

 

# Handle each page differently depending on whether it is an article

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ArticleSpider(CrawlSpider):
    name = 'article'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        Rule(
          LinkExtractor(allow='(/wiki/)((?!:).)*$')  ,
          callback='parse_items',
          cb_kwargs={'is_article': True}  ## passed on to the callback
        ),
        Rule(
            LinkExtractor(allow='.*'),
            callback='parse_items',  ## parses the page content
            cb_kwargs={'is_article': False}
        )
    ]
    
    def parse_items(self, response, is_article):
        print(response.url)
        title = response.css('span.mw-page-title-main::text').extract_first()
        if is_article:
            url = response.url
            text = response.xpath('//div[@id="mw-content-text"]//text()').extract()
            lastUpdated = response.css(
                'li#footer-info-lastmod::text'
            ).extract_first()
            lastUpdated = lastUpdated.replace('This page was last edited on ','')
            print(f'URL is: {url}')
            print(f'Title is: {title}')
            print(f'Text is: {text}')
            print(f'Last updated: {lastUpdated}')
        else:
            print(f'This is not an article: {title}')
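
The first rule's allow pattern keeps only /wiki/ URLs with no colon after /wiki/, which is how regular articles differ from namespace pages such as File: or Talk:. A quick check of the regex on its own (LinkExtractor matches it against the full link URL); the sample URLs below are just for illustration:

import re

pattern = re.compile('(/wiki/)((?!:).)*$')

urls = [
    'https://en.wikipedia.org/wiki/Monty_Python',           # article page -> matches
    'https://en.wikipedia.org/wiki/File:Python_logo.png',   # namespace page -> no match
]
for url in urls:
    print(url, bool(pattern.search(url)))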

 

Organizing the data with Item classes

Edit wikiSpider/items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class Article(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    lastUpdated = scrapy.Field()
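
Scrapy Item objects support dictionary-style access, which is what the spiders below rely on. A minimal illustration (the field values here are made up):

from wikiSpider.items import Article

article = Article()
article['url'] = 'https://en.wikipedia.org/wiki/Example'  # hypothetical value, for illustration only
article['title'] = 'Example'
print(dict(article))  # prints a plain dict of the fields that were set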

 

Create a new articleItems.py that uses this class

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from wikiSpider.items import Article

class ArticleSpider(CrawlSpider):
    name = 'articleItems'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        Rule(
          LinkExtractor(allow='(/wiki/)((?!:).)*$'),
          callback='parse_items',
          follow=True,  # follow linked pages as well
        )
    ]
    
    def parse_items(self, response):
        article = Article()
        article['url'] = response.url
        article['title'] = response.css('h1::text').extract_first()
        article['text'] = response.xpath('//div[@id="mw-content-text"]//text()').extract()
        lastUpdated = response.css(
            'li#footer-info-lastmod::text'
        ).extract_first()
        article['lastUpdated'] = lastUpdated.replace('This page was last edited on ','')
        return article

 

Run

scrapy runspider wikiSpider/spiders/articleItems.py

 

# Write the extracted items to a file (the output format is inferred from the file extension)

scrapy runspider wikiSpider/spiders/articleItems.py -o articles.csv

scrapy runspider wikiSpider/spiders/articleItems.py -o articles.json

scrapy runspider wikiSpider/spiders/articleItems.py -o articles.xml
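
pandas (installed above) makes it easy to glance at the exported data; a small sketch assuming articles.json was produced by the command above:

import pandas as pd

df = pd.read_json('articles.json')  # Scrapy's JSON export is a list of item dicts
print(df[['url', 'title', 'lastUpdated']].head())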

 

# Item pipeline

Uncomment the following three lines in settings.py

ITEM_PIPELINES = {
    "wikiSpider.pipelines.WikispiderPipeline": 300,
}

 

Create spiders/articlePipelines.py (the spider is now responsible only for collecting the data; it does not modify it)

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from wikiSpider.items import Article

class ArticleSpider(CrawlSpider):
    name = 'articleItems'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        Rule(
          LinkExtractor(allow='(/wiki/)((?!:).)*$'),
          callback='parse_items',
          follow=True,  # follow linked pages as well
        )
    ]
    
    def parse_items(self, response):
        article = Article()
        article['url'] = response.url
        article['title'] = response.css('h1::text').extract_first()
        article['text'] = response.xpath('//div[@id="mw-content-text"]//text()').extract()
        article['lastUpdated'] = response.css('li#footer-info-lastmod::text').extract_first()
        return article

 

Edit pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from datetime import datetime
from wikiSpider.items import Article
from string import whitespace

class WikispiderPipeline(object):
    def process_item(self, article, spider):
        dateStr = article['lastUpdated']
        dateStr = dateStr.replace('This page was last edited on', '')
        dateStr = dateStr.strip()
        dateStr = datetime.strptime(dateStr, '%d %B %Y, at %H:%M')
        dateStr = dateStr.strftime('%Y-%m-%d %H:%M:%S')
        article['lastUpdated'] = dateStr
        
        texts = article['text'][0:50]
        texts = [line for line in texts if line not in whitespace]
        article['text'] = ''.join(texts)
        return article
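
To see what the pipeline does to the date field, here is the same conversion run on a made-up footer string (the exact text Wikipedia serves may differ; the sample below is only an assumption for illustration):

from datetime import datetime

sample = 'This page was last edited on 5 March 2025, at 14:22'  # hypothetical footer text
dateStr = sample.replace('This page was last edited on', '').strip()
parsed = datetime.strptime(dateStr, '%d %B %Y, at %H:%M')
print(parsed.strftime('%Y-%m-%d %H:%M:%S'))  # 2025-03-05 14:22:00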

 

# Setting the log level

Available log levels

CRITICAL, ERROR, WARNING, INFO, DEBUG

 

# Add to settings.py

LOG_LEVEL = 'ERROR'

 

# Write the log to a separate file at run time

scrapy runspider wikiSpider/spiders/articleItems.py -o articles.json --logfile wi.log

 

# Scraping with Selenium

 

Install Chrome

# 1. Add the Google repository
cat <<EOF > /etc/yum.repos.d/google-chrome.repo
[google-chrome]
name=Google Chrome
baseurl=https://dl.google.com/linux/chrome/rpm/stable/x86_64
enabled=1
gpgcheck=1
gpgkey=https://dl.google.com/linux/linux_signing_key.pub
EOF

# 2. Install
dnf install -y google-chrome-stable

 

Check that it works

google-chrome --version

 

Basic runner template

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # required on a server without a GUI
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.binary_location = '/usr/bin/google-chrome'

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

driver.get("http://www.python.org")
time.sleep(2)
driver.close()

 

Fetching the actual content of an Ajax-loaded page

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # required on a server without a GUI
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.binary_location = '/usr/bin/google-chrome'

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
time.sleep(3)
print(driver.find_element(By.ID, 'content').text)
driver.close()

 

# Scrape the updated content without sleep, by waiting until the element appears

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # required on a server without a GUI
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.binary_location = '/usr/bin/google-chrome'

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
    
try:
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.ID, 'loadedButton'))
    )
finally:
    print(driver.find_element(By.ID, 'content').text)
    driver.close()

 

# Redirect example

Check every 0.5 seconds and treat the original element going away (becoming stale) as the redirect having happened

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # required on a server without a GUI
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.binary_location = '/usr/bin/google-chrome'

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

def waitForLoad(driver):
    # keep a reference to the <html> element of the current page
    elem = driver.find_element(By.TAG_NAME, "html")
    for _ in range(0, 20):
        try:
            # re-check the saved element; if it has gone stale because a new
            # page was loaded, StaleElementReferenceException ends the wait
            elem == driver.find_element(By.TAG_NAME, "html")
        except StaleElementReferenceException:
            return
        time.sleep(0.5)
    print("Timing out after 10 seconds and returning")

driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
waitForLoad(driver)
print(driver.page_source)
driver.close()

 

# Detecting the change with a 15-second wait

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # required on a server without a GUI
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.binary_location = '/usr/bin/google-chrome'

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)
driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
try:
    txt = 'This is the page you are looking for!'
    bodyElement = WebDriverWait(driver, 15).until(
        EC.presence_of_all_elements_located((
            By.XPATH,
            f'//body[contains(text(), "{txt}")]'
        ))
    )
    print(bodyElement[0].text)
except TimeoutException:
    print('Did not find the element')
    
driver.close()

 

 

 

3. Example code on GitHub

GitHub - november11th/python-scraping: example code repository for 『파이썬으로 웹 크롤러 만들기(3판)』 (Hanbit Media, 2025): https://github.com/november11th/python-scraping

 


 

 
