GPT-Generated Crawler
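The script below polls a list of feeds on a fixed interval, filters entries whose titles contain configured keywords, and forwards the matches to a Telegram user through a bot.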
import requests
from bs4 import BeautifulSoup
import telebot
import time
# Create the Telegram bot
bot = telebot.TeleBot('YOUR_TELEGRAM_BOT_TOKEN')
# Target sites and the keywords to watch for on each
websites = {
    'site1': {
        'url': 'https://rss.nodeseek.com/',
        'keywords': ["出鸡", "出"]  # forum slang for "for sale" posts
    },
    'site2': {
        'url': 'https://example.com',
        'keywords': ["keyword1", "keyword2"]
    }
}
# Telegram user ID that should receive the messages
user_id = 'YOUR_TELEGRAM_USER_ID'  # replace with the target Telegram user ID
def get_content(url):
    """
    Fetch a URL and parse the response as XML.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # fail loudly on HTTP errors
    soup = BeautifulSoup(response.content, features="xml")
    return soup
def extract_articles(soup, keywords):
    """
    Extract feed items whose title contains any of the keywords.
    """
    # NOTE: the targets above are RSS feeds, so we look for <item> and
    # <title> elements rather than HTML <article>/<h2> tags.
    articles = []
    for article in soup.find_all('item'):
        title = article.find('title').text
        for keyword in keywords:
            if keyword in title:
                articles.append(article)
                break
    return articles
def format_article(article):
    """
    Format a feed item as "title\nlink".
    """
    title = article.find('title').text
    link = article.find('link').text
    return f"{title}\n{link}"
def send_articles(articles, user_id):
    """
    Send the articles to the given Telegram user.
    """
    for article in articles:
        message = format_article(article)
        bot.send_message(user_id, message)
        # Wait 3 seconds before sending the next article
        time.sleep(3)
if __name__ == '__main__':
    while True:
        for site, info in websites.items():
            # Fetch the feed
            soup = get_content(info['url'])
            print(f"Fetched {site} successfully!")
            # Extract the items that match the keywords
            articles = extract_articles(soup, info['keywords'])
            print(f"{site}: found {len(articles)} articles matching the keywords!")
            # Forward them to the Telegram user
            send_articles(articles, user_id)
            print(f"{site}: articles sent!")
        # Pause before the next crawl
        time.sleep(3600)  # sleep for 1 hour (3600 seconds)
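The script depends on the PyPI packages requests, beautifulsoup4, lxml (which backs BeautifulSoup's "xml" parser), and pyTelegramBotAPI (which provides the telebot module).

One caveat: the main loop resends every matching article on each pass, so the same post is forwarded again every hour. A minimal de-duplication sketch, assuming an item's link uniquely identifies it (the seen_links set and the send_new_articles helper are illustrative additions, not part of the original script):

seen_links = set()  # links already forwarded (in memory only; persist to disk if needed)

def send_new_articles(articles, user_id):
    """
    Send only articles whose link has not been forwarded yet.
    """
    for article in articles:
        link = article.find('link').text
        if link in seen_links:
            continue  # already sent on an earlier pass
        bot.send_message(user_id, format_article(article))
        seen_links.add(link)
        time.sleep(3)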