```python
# A simple web crawler in Python
import requests
from bs4 import BeautifulSoup
import time
import random
import os
import urllib.parse


class SimpleWebCrawler:
    def __init__(self, delay=1, max_pages=100):
        self.delay = delay                 # delay between requests (seconds)
        self.max_pages = max_pages         # maximum number of pages to crawl
        self.visited_urls = set()          # set of URLs already visited
        self.pages_crawled = 0             # number of pages crawled so far
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def crawl(self, start_url, max_depth=2):
        """Crawl starting from start_url, up to max_depth levels deep."""
        urls_to_crawl = [(start_url, 0)]  # queue of (URL, depth) pairs
        while urls_to_crawl and self.pages_crawled < self.max_pages:
            url, depth = urls_to_crawl.pop(0)
            if url not in self.visited_urls and depth <= max_depth:
                try:
                    print(f"Crawling: {url} (depth {depth})")
                    html_content = self.fetch_page(url)
                    if html_content:
                        self.save_content(url, html_content)
                        links = self.extract_links(html_content, url)
                        for link in links:
                            if link not in self.visited_urls:
                                urls_to_crawl.append((link, depth + 1))
                    time.sleep(self.delay + random.uniform(0.5, 1.5))  # randomized delay
                except Exception as e:
                    print(f"Error while crawling {url}: {e}")

    def fetch_page(self, url):
        """Fetch the content of a page."""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()  # raise if the request failed
            return response.text
        except requests.RequestException as e:
            print(f"Error requesting {url}: {e}")
            return None

    def extract_links(self, html_content, base_url):
        """Extract links from HTML content."""
        soup = BeautifulSoup(html_content, 'html.parser')
        links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            absolute_url = urllib.parse.urljoin(base_url, href)
            if absolute_url.startswith('http'):
                links.append(absolute_url)
        return links

    def save_content(self, url, content):
        """Save page content to a file."""
        # Create the output directory if needed
        if not os.path.exists('crawled_pages'):
            os.makedirs('crawled_pages')
        # Build a file name from the percent-encoded URL
        filename = urllib.parse.quote(url, safe='') + '.html'
        filepath = os.path.join('crawled_pages', filename)
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
            print(f"Page saved to: {filepath}")
        except Exception as e:
            print(f"Error saving file: {e}")
        self.visited_urls.add(url)
        self.pages_crawled += 1


def main():
    # Create a crawler instance
    crawler = SimpleWebCrawler(delay=2, max_pages=50)
    # Start crawling from the given URL, with a maximum depth of 2
    start_url = 'https://news.sina.com.cn/'  # replace with any site you want to crawl
    crawler.crawl(start_url, max_depth=2)
    print(f"\nCrawl finished! {len(crawler.visited_urls)} pages crawled")


if __name__ == "__main__":
    main()
```
This crawler provides the following features:
- Starts crawling from a given start URL
- Automatically extracts links from each page and follows them
- Limits the crawl depth
- Adds a randomized delay so requests are not sent too frequently
- Saves page content to local files
- Sends a browser-style User-Agent header
- Error handling and request timeouts

Usage example:
```python
# Crawl the Sina News site with a maximum depth of 1 (the home page and pages it links to directly)
crawler = SimpleWebCrawler(delay=2, max_pages=20)
crawler.crawl('https://news.sina.com.cn/', max_depth=1)
```
Note: make sure you comply with the target site's robots.txt rules and respect its terms of service. This example program is intended for educational purposes only.
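SimpleWebCrawler itself does not check robots.txt. A minimal sketch of such a check, using the standard-library urllib.robotparser that the advanced version below also relies on, might look like this (the helper name allowed_by_robots is illustrative):

```python
import urllib.parse
import urllib.robotparser


def allowed_by_robots(url, user_agent='*'):
    """Return True if the site's robots.txt permits fetching this URL."""
    robots_url = urllib.parse.urljoin(url, '/robots.txt')
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
    except OSError:
        return True  # assumption: treat an unreadable robots.txt as permissive
    return rp.can_fetch(user_agent, url)


# Example: skip a start URL that robots.txt disallows
if allowed_by_robots('https://news.sina.com.cn/'):
    print("Allowed to crawl")
```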
# Complete Crawler Implementation
```python
import requests
from bs4 import BeautifulSoup
import time
import random
import os
import urllib.parse
import urllib.robotparser
import threading
from queue import Queue, Empty


class AdvancedWebCrawler:
    def __init__(self, delay=1, max_pages=100, max_threads=5):
        self.delay = delay
        self.max_pages = max_pages
        self.max_threads = max_threads
        self.max_depth = 2
        self.visited_urls = set()
        self.pages_crawled = 0
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.url_queue = Queue()
        self.lock = threading.Lock()

    def can_fetch(self, url):
        """Check the site's robots.txt rules."""
        try:
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(urllib.parse.urljoin(url, '/robots.txt'))
            rp.read()
            return rp.can_fetch(self.session.headers['User-Agent'], url)
        except Exception:
            return True  # if robots.txt cannot be read, allow crawling by default

    def crawl(self, start_url, max_depth=2):
        """Start the multithreaded crawl."""
        if not self.can_fetch(start_url):
            print(f"Crawling disallowed by robots.txt: {start_url}")
            return
        self.max_depth = max_depth
        self.url_queue.put((start_url, 0))
        threads = []
        for _ in range(self.max_threads):
            thread = threading.Thread(target=self.worker)
            thread.daemon = True
            thread.start()
            threads.append(thread)
        self.url_queue.join()
        print(f"Crawl finished! {self.pages_crawled} pages crawled")

    def worker(self):
        """Worker thread: take URLs from the queue until it stays empty."""
        while True:
            try:
                url, depth = self.url_queue.get(timeout=10)
            except Empty:
                break
            if url not in self.visited_urls:
                self.process_url(url, depth)
            self.url_queue.task_done()

    def process_url(self, url, depth):
        """Process a single URL."""
        try:
            print(f"Crawling: {url} (depth {depth})")
            html_content = self.fetch_page(url)
            if html_content:
                self.save_content(url, html_content)
                # Extract links and add them to the queue
                links = self.extract_links(html_content, url)
                for link in links:
                    if (link not in self.visited_urls
                            and depth < self.max_depth
                            and self.pages_crawled < self.max_pages):
                        self.url_queue.put((link, depth + 1))
            # Randomized delay
            time.sleep(self.delay + random.uniform(0.5, 1.5))
        except Exception as e:
            print(f"Error while crawling {url}: {e}")

    def fetch_page(self, url):
        """Fetch the content of a page."""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error requesting {url}: {e}")
            return None

    def extract_links(self, html_content, base_url):
        """Extract links from HTML content."""
        soup = BeautifulSoup(html_content, 'html.parser')
        links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            absolute_url = urllib.parse.urljoin(base_url, href)
            if absolute_url.startswith('http'):
                links.append(absolute_url)
        return links

    def save_content(self, url, content):
        """Save page content to a file."""
        if not os.path.exists('crawled_pages'):
            os.makedirs('crawled_pages')
        filename = urllib.parse.quote(url, safe='') + '.html'
        filepath = os.path.join('crawled_pages', filename)
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
            print(f"Page saved to: {filepath}")
        except Exception as e:
            print(f"Error saving file: {e}")
        with self.lock:
            self.visited_urls.add(url)
            self.pages_crawled += 1


def main():
    # Create an advanced crawler instance
    crawler = AdvancedWebCrawler(delay=2, max_pages=50, max_threads=3)
    # Start crawling from the given URL, with a maximum depth of 2
    start_url = 'https://news.sina.com.cn/'  # replace with any site you want to crawl
    crawler.crawl(start_url, max_depth=2)


if __name__ == "__main__":
    main()
```
This enhanced version of the crawler has the following features:
1. **Multithreading**: crawls several pages at once with multiple threads, improving throughput
2. **Queue management**: manages the URLs waiting to be crawled with a queue
3. **robots.txt support**: checks and respects the site's robots.txt rules
4. **Thread safety**: uses a lock to keep shared state consistent
5. **Error handling**: solid error handling and request timeouts

Usage:
```python
# Create a crawler instance
crawler = AdvancedWebCrawler(delay=2, max_pages=50, max_threads=3)
# Start crawling
crawler.crawl('https://example.com', max_depth=2)
```
This program can serve as a basic framework that you can extend with additional features as needed, such as:
- Proxy server support (see the sketch below)
- Database storage
- Image downloading
- Pages rendered with JavaScript (requires Selenium)
- Calling API endpoints
Make sure you comply with applicable laws and the site's terms of service before crawling any website.
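As an example of the first extension, a minimal sketch of proxy support could route all of the session's requests through a proxy by setting the standard `proxies` mapping of a `requests.Session`; the proxy address shown is a placeholder, not a working proxy:

```python
# A minimal sketch: adding proxy support to the crawler's requests.Session.
import requests

session = requests.Session()
session.proxies.update({
    'http': 'http://127.0.0.1:8080',   # placeholder HTTP proxy
    'https': 'http://127.0.0.1:8080',  # placeholder proxy for HTTPS traffic
})

# Every request made through this session now goes via the proxy,
# so the same idea can be applied to AdvancedWebCrawler.session in __init__.
response = session.get('https://example.com', timeout=10)
print(response.status_code)
```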
# A Complete Multi-Feature Crawler
```python
import requests
from bs4 import BeautifulSoup
import time
import random
import os
import urllib.parse
import urllib.robotparser
import threading
from queue import Queue, Empty
import hashlib
import sqlite3


class AdvancedWebCrawler:
    def __init__(self, delay=1, max_pages=100, max_threads=5, use_database=False):
        self.delay = delay
        self.max_pages = max_pages
        self.max_threads = max_threads
        self.max_depth = 2
        self.visited_urls = set()
        self.pages_crawled = 0
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.url_queue = Queue()
        self.lock = threading.Lock()
        # Optional database support
        self.use_database = use_database
        if use_database:
            self.init_database()

    def init_database(self):
        """Initialize the SQLite database."""
        self.conn = sqlite3.connect('crawler.db', check_same_thread=False)
        self.cursor = self.conn.cursor()
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS pages (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE,
                content TEXT,
                title TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        self.conn.commit()

    def can_fetch(self, url):
        """Check the site's robots.txt rules."""
        try:
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(urllib.parse.urljoin(url, '/robots.txt'))
            rp.read()
            return rp.can_fetch(self.session.headers['User-Agent'], url)
        except Exception:
            return True  # if robots.txt cannot be read, allow crawling by default

    def crawl(self, start_url, max_depth=2):
        """Start the multithreaded crawl."""
        if not self.can_fetch(start_url):
            print(f"Crawling disallowed by robots.txt: {start_url}")
            return
        self.max_depth = max_depth
        self.url_queue.put((start_url, 0))
        threads = []
        for _ in range(self.max_threads):
            thread = threading.Thread(target=self.worker)
            thread.daemon = True
            thread.start()
            threads.append(thread)
        self.url_queue.join()
        print(f"Crawl finished! {self.pages_crawled} pages crawled")
        if self.use_database:
            self.conn.close()

    def worker(self):
        """Worker thread: take URLs from the queue until it stays empty."""
        while True:
            try:
                url, depth = self.url_queue.get(timeout=10)
            except Empty:
                break
            if url not in self.visited_urls:
                self.process_url(url, depth)
            self.url_queue.task_done()

    def process_url(self, url, depth):
        """Process a single URL."""
        try:
            print(f"Crawling: {url} (depth {depth})")
            html_content = self.fetch_page(url)
            if html_content:
                # Extract the title and plain-text content
                title = self.extract_title(html_content)
                content = self.extract_text(html_content)
                # Save the content
                self.save_content(url, title, content)
                # Extract links and add them to the queue
                links = self.extract_links(html_content, url)
                for link in links:
                    if (link not in self.visited_urls
                            and depth < self.max_depth
                            and self.pages_crawled < self.max_pages):
                        self.url_queue.put((link, depth + 1))
            # Randomized delay
            time.sleep(self.delay + random.uniform(0.5, 1.5))
        except Exception as e:
            print(f"Error while crawling {url}: {e}")

    def fetch_page(self, url):
        """Fetch the content of a page."""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error requesting {url}: {e}")
            return None

    def extract_title(self, html_content):
        """Extract the page title."""
        soup = BeautifulSoup(html_content, 'html.parser')
        title_tag = soup.find('title')
        return title_tag.get_text().strip() if title_tag else 'No Title'

    def extract_text(self, html_content):
        """Extract the plain-text content of the page."""
        soup = BeautifulSoup(html_content, 'html.parser')
        # Remove script and style tags
        for script in soup(["script", "style"]):
            script.extract()
        return soup.get_text()

    def extract_links(self, html_content, base_url):
        """Extract links from HTML content."""
        soup = BeautifulSoup(html_content, 'html.parser')
        links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            absolute_url = urllib.parse.urljoin(base_url, href)
            if absolute_url.startswith('http'):
                links.append(absolute_url)
        return links

    def save_content(self, url, title, content):
        """Save page content to the database or to a file."""
        if self.use_database:
            # Save to the database (writes are serialized across threads with the lock)
            try:
                with self.lock:
                    self.cursor.execute(
                        "INSERT OR REPLACE INTO pages (url, title, content) VALUES (?, ?, ?)",
                        (url, title, content)
                    )
                    self.conn.commit()
                print(f"Page saved to database: {url}")
            except Exception as e:
                print(f"Error saving to database: {e}")
        else:
            # Save to a file
            if not os.path.exists('crawled_pages'):
                os.makedirs('crawled_pages')
            # Build a file name from the MD5 hash of the URL
            url_hash = hashlib.md5(url.encode()).hexdigest()
            filename = f"{url_hash}.txt"
            filepath = os.path.join('crawled_pages', filename)
            try:
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(f"URL: {url}\n")
                    f.write(f"Title: {title}\n")
                    f.write(f"Content:\n{content}\n")
                print(f"Page saved to: {filepath}")
            except Exception as e:
                print(f"Error saving file: {e}")
        with self.lock:
            self.visited_urls.add(url)
            self.pages_crawled += 1


def main():
    # Create an advanced crawler instance that stores pages in the database
    crawler = AdvancedWebCrawler(delay=2, max_pages=50, max_threads=3, use_database=True)
    # Start crawling from the given URL, with a maximum depth of 2
    start_url = 'https://news.sina.com.cn/'  # replace with any site you want to crawl
    crawler.crawl(start_url, max_depth=2)


if __name__ == "__main__":
    main()
```
This complete version of the crawler has the following features:
1. **Multithreading**: crawls several pages at once with multiple threads
2. **Database support**: can optionally store crawled content in an SQLite database
3. **robots.txt support**: checks and respects the site's robots.txt rules
4. **Plain-text extraction**: extracts the page's plain text, stripping HTML tags
5. **Title extraction**: extracts the page title
6. **Content saving**: stores the URL, title, and plain-text content
7. **Error handling**: solid error handling and request timeouts
8. **Thread safety**: uses a lock to keep shared state consistent

Usage example:
```python
# Store results as files
crawler = AdvancedWebCrawler(delay=2, max_pages=50, max_threads=3, use_database=False)
# Store results in the SQLite database
crawler = AdvancedWebCrawler(delay=2, max_pages=50, max_threads=3, use_database=True)
```
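After a run with use_database=True, the rows written by save_content can be read back from the crawler.db file that init_database creates; a minimal sketch:

```python
import sqlite3

# Read back what the crawler stored in crawler.db (created by init_database)
conn = sqlite3.connect('crawler.db')
cursor = conn.cursor()
cursor.execute("SELECT url, title, created_at FROM pages ORDER BY created_at DESC LIMIT 10")
for url, title, created_at in cursor.fetchall():
    print(f"{created_at}  {title}  {url}")
conn.close()
```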
This program can serve as a fully featured web-crawling framework for a wide range of page-scraping tasks. Make sure you comply with applicable laws and the site's terms of service before crawling any website.
