```python
# A simple web crawler in Python
import requests
from bs4 import BeautifulSoup
import time
import random
import os
import urllib.parse


class SimpleWebCrawler:
    def __init__(self, delay=1, max_pages=100):
        self.delay = delay                 # delay between requests (seconds)
        self.max_pages = max_pages         # maximum number of pages to crawl
        self.visited_urls = set()          # set of URLs already visited
        self.pages_crawled = 0             # number of pages crawled so far
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def crawl(self, start_url, max_depth=2):
        """Crawl starting from start_url, up to max_depth levels deep."""
        urls_to_crawl = [(start_url, 0)]  # queue of (URL, depth) pairs
        while urls_to_crawl and self.pages_crawled < self.max_pages:
            url, depth = urls_to_crawl.pop(0)
            if url not in self.visited_urls and depth <= max_depth:
                try:
                    print(f"Crawling: {url} (depth {depth})")
                    html_content = self.fetch_page(url)
                    if html_content:
                        self.save_content(url, html_content)
                        links = self.extract_links(html_content, url)
                        for link in links:
                            if link not in self.visited_urls:
                                urls_to_crawl.append((link, depth + 1))
                    time.sleep(self.delay + random.uniform(0.5, 1.5))  # randomized delay
                except Exception as e:
                    print(f"Error while crawling {url}: {e}")

    def fetch_page(self, url):
        """Fetch the content of a page."""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()  # raise if the request failed
            return response.text
        except requests.RequestException as e:
            print(f"Error requesting {url}: {e}")
            return None

    def extract_links(self, html_content, base_url):
        """Extract links from HTML content."""
        soup = BeautifulSoup(html_content, 'html.parser')
        links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            absolute_url = urllib.parse.urljoin(base_url, href)
            if absolute_url.startswith('http'):
                links.append(absolute_url)
        return links

    def save_content(self, url, content):
        """Save page content to a file."""
        # Create the output directory if needed
        if not os.path.exists('crawled_pages'):
            os.makedirs('crawled_pages')
        # Build a file name from the percent-encoded URL
        filename = urllib.parse.quote(url, safe='') + '.html'
        filepath = os.path.join('crawled_pages', filename)
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
            print(f"Page saved to: {filepath}")
        except Exception as e:
            print(f"Error saving file: {e}")
        self.visited_urls.add(url)
        self.pages_crawled += 1


def main():
    # Create a crawler instance
    crawler = SimpleWebCrawler(delay=2, max_pages=50)
    # Start crawling from the given URL, with a maximum depth of 2
    start_url = 'https://news.sina.com.cn/'  # replace with any site you want to crawl
    crawler.crawl(start_url, max_depth=2)
    print(f"\nCrawl finished! {len(crawler.visited_urls)} pages crawled")


if __name__ == "__main__":
    main()
```
This crawler provides the following features:
- Starts crawling from a given start URL
- Automatically extracts links from each page and follows them
- Limits the crawl depth
- Adds a randomized delay so requests are not sent too frequently
- Saves page content to local files
- Sends a browser-style User-Agent header
- Error handling and request timeouts

Usage example:
```python
# Crawl the Sina News site with a maximum depth of 1 (the home page and pages it links to directly)
crawler = SimpleWebCrawler(delay=2, max_pages=20)
crawler.crawl('https://news.sina.com.cn/', max_depth=1)
```
Note: make sure you comply with the target site's robots.txt rules and respect its terms of service. This example program is intended for educational purposes only.
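SimpleWebCrawler itself does not check robots.txt. A minimal sketch of such a check, using the standard-library urllib.robotparser that the advanced version below also relies on, might look like this (the helper name allowed_by_robots is illustrative):

```python
import urllib.parse
import urllib.robotparser


def allowed_by_robots(url, user_agent='*'):
    """Return True if the site's robots.txt permits fetching this URL."""
    robots_url = urllib.parse.urljoin(url, '/robots.txt')
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
    except OSError:
        return True  # assumption: treat an unreadable robots.txt as permissive
    return rp.can_fetch(user_agent, url)


# Example: skip a start URL that robots.txt disallows
if allowed_by_robots('https://news.sina.com.cn/'):
    print("Allowed to crawl")
```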
# Complete Crawler Implementation
```python
import requests
from bs4 import BeautifulSoup
import time
import random
import os
import urllib.parse
import urllib.robotparser
import threading
from queue import Queue, Empty


class AdvancedWebCrawler:
    def __init__(self, delay=1, max_pages=100, max_threads=5):
        self.delay = delay
        self.max_pages = max_pages
        self.max_threads = max_threads
        self.max_depth = 2
        self.visited_urls = set()
        self.pages_crawled = 0
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.url_queue = Queue()
        self.lock = threading.Lock()

    def can_fetch(self, url):
        """Check the site's robots.txt rules."""
        try:
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(urllib.parse.urljoin(url, '/robots.txt'))
            rp.read()
            return rp.can_fetch(self.session.headers['User-Agent'], url)
        except Exception:
            return True  # if robots.txt cannot be read, allow crawling by default

    def crawl(self, start_url, max_depth=2):
        """Start the multithreaded crawl."""
        if not self.can_fetch(start_url):
            print(f"Crawling disallowed by robots.txt: {start_url}")
            return
        self.max_depth = max_depth
        self.url_queue.put((start_url, 0))
        threads = []
        for _ in range(self.max_threads):
            thread = threading.Thread(target=self.worker)
            thread.daemon = True
            thread.start()
            threads.append(thread)
        self.url_queue.join()
        print(f"Crawl finished! {self.pages_crawled} pages crawled")

    def worker(self):
        """Worker thread: take URLs from the queue until it stays empty."""
        while True:
            try:
                url, depth = self.url_queue.get(timeout=10)
            except Empty:
                break
            if url not in self.visited_urls:
                self.process_url(url, depth)
            self.url_queue.task_done()

    def process_url(self, url, depth):
        """Process a single URL."""
        try:
            print(f"Crawling: {url} (depth {depth})")
            html_content = self.fetch_page(url)
            if html_content:
                self.save_content(url, html_content)
                # Extract links and add them to the queue
                links = self.extract_links(html_content, url)
                for link in links:
                    if (link not in self.visited_urls
                            and depth < self.max_depth
                            and self.pages_crawled < self.max_pages):
                        self.url_queue.put((link, depth + 1))
            # Randomized delay
            time.sleep(self.delay + random.uniform(0.5, 1.5))
        except Exception as e:
            print(f"Error while crawling {url}: {e}")

    def fetch_page(self, url):
        """Fetch the content of a page."""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error requesting {url}: {e}")
            return None

    def extract_links(self, html_content, base_url):
        """Extract links from HTML content."""
        soup = BeautifulSoup(html_content, 'html.parser')
        links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            absolute_url = urllib.parse.urljoin(base_url, href)
            if absolute_url.startswith('http'):
                links.append(absolute_url)
        return links

    def save_content(self, url, content):
        """Save page content to a file."""
        if not os.path.exists('crawled_pages'):
            os.makedirs('crawled_pages')
        filename = urllib.parse.quote(url, safe='') + '.html'
        filepath = os.path.join('crawled_pages', filename)
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
            print(f"Page saved to: {filepath}")
        except Exception as e:
            print(f"Error saving file: {e}")
        with self.lock:
            self.visited_urls.add(url)
            self.pages_crawled += 1


def main():
    # Create an advanced crawler instance
    crawler = AdvancedWebCrawler(delay=2, max_pages=50, max_threads=3)
    # Start crawling from the given URL, with a maximum depth of 2
    start_url = 'https://news.sina.com.cn/'  # replace with any site you want to crawl
    crawler.crawl(start_url, max_depth=2)


if __name__ == "__main__":
    main()
```
This enhanced version of the crawler has the following features:
1. **Multithreading**: crawls several pages at once with multiple threads, improving throughput
2. **Queue management**: manages the URLs waiting to be crawled with a queue
3. **robots.txt support**: checks and respects the site's robots.txt rules
4. **Thread safety**: uses a lock to keep shared state consistent
5. **Error handling**: solid error handling and request timeouts

Usage:
```python
# Create a crawler instance
crawler = AdvancedWebCrawler(delay=2, max_pages=50, max_threads=3)
# Start crawling
crawler.crawl('https://example.com', max_depth=2)
```
This program can serve as a basic framework that you can extend with additional features as needed, such as:
- Proxy server support (see the sketch below)
- Database storage
- Image downloading
- Pages rendered with JavaScript (requires Selenium)
- Calling API endpoints
Make sure you comply with applicable laws and the site's terms of service before crawling any website.
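As an example of the first extension, a minimal sketch of proxy support could route all of the session's requests through a proxy by setting the standard `proxies` mapping of a `requests.Session`; the proxy address shown is a placeholder, not a working proxy:

```python
# A minimal sketch: adding proxy support to the crawler's requests.Session.
import requests

session = requests.Session()
session.proxies.update({
    'http': 'http://127.0.0.1:8080',   # placeholder HTTP proxy
    'https': 'http://127.0.0.1:8080',  # placeholder proxy for HTTPS traffic
})

# Every request made through this session now goes via the proxy,
# so the same idea can be applied to AdvancedWebCrawler.session in __init__.
response = session.get('https://example.com', timeout=10)
print(response.status_code)
```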
# A Complete Multi-Feature Crawler
```python
import requests
from bs4 import BeautifulSoup
import time
import random
import os
import urllib.parse
import urllib.robotparser
import threading
from queue import Queue, Empty
import hashlib
import sqlite3


class AdvancedWebCrawler:
    def __init__(self, delay=1, max_pages=100, max_threads=5, use_database=False):
        self.delay = delay
        self.max_pages = max_pages
        self.max_threads = max_threads
        self.max_depth = 2
        self.visited_urls = set()
        self.pages_crawled = 0
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.url_queue = Queue()
        self.lock = threading.Lock()
        # Optional database support
        self.use_database = use_database
        if use_database:
            self.init_database()

    def init_database(self):
        """Initialize the SQLite database."""
        self.conn = sqlite3.connect('crawler.db', check_same_thread=False)
        self.cursor = self.conn.cursor()
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS pages (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE,
                content TEXT,
                title TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        self.conn.commit()

    def can_fetch(self, url):
        """Check the site's robots.txt rules."""
        try:
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(urllib.parse.urljoin(url, '/robots.txt'))
            rp.read()
            return rp.can_fetch(self.session.headers['User-Agent'], url)
        except Exception:
            return True  # if robots.txt cannot be read, allow crawling by default

    def crawl(self, start_url, max_depth=2):
        """Start the multithreaded crawl."""
        if not self.can_fetch(start_url):
            print(f"Crawling disallowed by robots.txt: {start_url}")
            return
        self.max_depth = max_depth
        self.url_queue.put((start_url, 0))
        threads = []
        for _ in range(self.max_threads):
            thread = threading.Thread(target=self.worker)
            thread.daemon = True
            thread.start()
            threads.append(thread)
        self.url_queue.join()
        print(f"Crawl finished! {self.pages_crawled} pages crawled")
        if self.use_database:
            self.conn.close()

    def worker(self):
        """Worker thread: take URLs from the queue until it stays empty."""
        while True:
            try:
                url, depth = self.url_queue.get(timeout=10)
            except Empty:
                break
            if url not in self.visited_urls:
                self.process_url(url, depth)
            self.url_queue.task_done()

    def process_url(self, url, depth):
        """Process a single URL."""
        try:
            print(f"Crawling: {url} (depth {depth})")
            html_content = self.fetch_page(url)
            if html_content:
                # Extract the title and plain-text content
                title = self.extract_title(html_content)
                content = self.extract_text(html_content)
                # Save the content
                self.save_content(url, title, content)
                # Extract links and add them to the queue
                links = self.extract_links(html_content, url)
                for link in links:
                    if (link not in self.visited_urls
                            and depth < self.max_depth
                            and self.pages_crawled < self.max_pages):
                        self.url_queue.put((link, depth + 1))
            # Randomized delay
            time.sleep(self.delay + random.uniform(0.5, 1.5))
        except Exception as e:
            print(f"Error while crawling {url}: {e}")

    def fetch_page(self, url):
        """Fetch the content of a page."""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error requesting {url}: {e}")
            return None

    def extract_title(self, html_content):
        """Extract the page title."""
        soup = BeautifulSoup(html_content, 'html.parser')
        title_tag = soup.find('title')
        return title_tag.get_text().strip() if title_tag else 'No Title'

    def extract_text(self, html_content):
        """Extract the plain-text content of the page."""
        soup = BeautifulSoup(html_content, 'html.parser')
        # Remove script and style tags
        for script in soup(["script", "style"]):
            script.extract()
        return soup.get_text()

    def extract_links(self, html_content, base_url):
        """Extract links from HTML content."""
        soup = BeautifulSoup(html_content, 'html.parser')
        links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            absolute_url = urllib.parse.urljoin(base_url, href)
            if absolute_url.startswith('http'):
                links.append(absolute_url)
        return links

    def save_content(self, url, title, content):
        """Save page content to the database or to a file."""
        if self.use_database:
            # Save to the database (writes are serialized across threads with the lock)
            try:
                with self.lock:
                    self.cursor.execute(
                        "INSERT OR REPLACE INTO pages (url, title, content) VALUES (?, ?, ?)",
                        (url, title, content)
                    )
                    self.conn.commit()
                print(f"Page saved to database: {url}")
            except Exception as e:
                print(f"Error saving to database: {e}")
        else:
            # Save to a file
            if not os.path.exists('crawled_pages'):
                os.makedirs('crawled_pages')
            # Build a file name from the MD5 hash of the URL
            url_hash = hashlib.md5(url.encode()).hexdigest()
            filename = f"{url_hash}.txt"
            filepath = os.path.join('crawled_pages', filename)
            try:
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(f"URL: {url}\n")
                    f.write(f"Title: {title}\n")
                    f.write(f"Content:\n{content}\n")
                print(f"Page saved to: {filepath}")
            except Exception as e:
                print(f"Error saving file: {e}")
        with self.lock:
            self.visited_urls.add(url)
            self.pages_crawled += 1


def main():
    # Create an advanced crawler instance that stores pages in the database
    crawler = AdvancedWebCrawler(delay=2, max_pages=50, max_threads=3, use_database=True)
    # Start crawling from the given URL, with a maximum depth of 2
    start_url = 'https://news.sina.com.cn/'  # replace with any site you want to crawl
    crawler.crawl(start_url, max_depth=2)


if __name__ == "__main__":
    main()
```
This complete version of the crawler has the following features:
1. **Multithreading**: crawls several pages at once with multiple threads
2. **Database support**: can optionally store crawled content in an SQLite database
3. **robots.txt support**: checks and respects the site's robots.txt rules
4. **Plain-text extraction**: extracts the page's plain text, stripping HTML tags
5. **Title extraction**: extracts the page title
6. **Content saving**: stores the URL, title, and plain-text content
7. **Error handling**: solid error handling and request timeouts
8. **Thread safety**: uses a lock to keep shared state consistent

Usage example:
```python
# Store results as files
crawler = AdvancedWebCrawler(delay=2, max_pages=50, max_threads=3, use_database=False)
# Store results in the SQLite database
crawler = AdvancedWebCrawler(delay=2, max_pages=50, max_threads=3, use_database=True)
```
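After a run with use_database=True, the rows written by save_content can be read back from the crawler.db file that init_database creates; a minimal sketch:

```python
import sqlite3

# Read back what the crawler stored in crawler.db (created by init_database)
conn = sqlite3.connect('crawler.db')
cursor = conn.cursor()
cursor.execute("SELECT url, title, created_at FROM pages ORDER BY created_at DESC LIMIT 10")
for url, title, created_at in cursor.fetchall():
    print(f"{created_at}  {title}  {url}")
conn.close()
```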
This program can serve as a fully featured web-crawling framework for a wide range of page-scraping tasks. Make sure you comply with applicable laws and the site's terms of service before crawling any website.
