Skip to content

substack fetch

substack fetch #8

name: substack fetch
on:
schedule:
- cron: '0 0 * * *' # Daily at midnight
workflow_dispatch:
jobs:
fetch-substack-posts:
name: Fetch latest blog posts from Substack
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
ref: master
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install feedparser requests beautifulsoup4 fake-useragent requests[socks] pysocks
- name: Run script to fetch Substack posts
uses: jannekem/run-python-script-action@v1
with:
script: |
import feedparser
import requests
import random
import os
import time
from fake_useragent import UserAgent
def get_geonode_proxies():
print("Fetching proxies from Geonode...")
try:
url = "https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc"
resp = requests.get(url, timeout=20)
data = resp.json()
proxies = []
for proxy in data.get("data", []):
if proxy.get("anonymityLevel") == "elite" and "socks4" in proxy.get("protocols", []):
ip = proxy["ip"]
port = proxy["port"]
proxies.append({
"http": f"socks4://{ip}:{port}",
"https": f"socks4://{ip}:{port}"
})
print(f"Loaded {len(proxies)} proxies.")
return proxies
except Exception as e:
print(f"Failed to fetch proxy list: {e}")
return []
proxy_list = get_geonode_proxies()
ua = UserAgent()
def get_blog_info(feed_url, num_entries=20):
session = requests.Session()
proxy = random.choice(proxy_list) if proxy_list else {}
session.proxies.update(proxy)
session.headers.update({'User-Agent': ua.random})
try:
response = session.get(feed_url, timeout=20)
raw_feed = response.content
feed = feedparser.parse(raw_feed)
entries = []
for entry in feed.entries[:num_entries]:
entries.append({
"title": entry.title,
"link": entry.link
})
return entries
except Exception as e:
print(f"Error fetching with proxy {proxy.get('http')}: {e}")
return []
def update_markdown_file(filename, blog_info, start_marker, end_marker):
if not os.path.exists(filename):
print(f"File {filename} not found.")
return
with open(filename, 'r', encoding='utf-8') as f:
content = f.read()
start_idx = content.find(start_marker) + len(start_marker)
end_idx = content.find(end_marker)
new_section = "\n".join(f"* [{item['title']}]({item['link']})" for item in blog_info)
updated = content[:start_idx] + "\n" + new_section + "\n" + content[end_idx:]
with open(filename, 'w', encoding='utf-8') as f:
f.write(updated)
print(f"Updated {filename}")
def fetch_with_retry(feed_url, max_retries=5):
for i in range(max_retries):
result = get_blog_info(feed_url)
if result:
return result
print(f"Retry {i+1}/{max_retries}")
time.sleep(2 ** i)
return []
feed_url = "https://datacommons.substack.com/feed"
blog_info = fetch_with_retry(feed_url)
filename = "docs/substack_blogs.md"
if blog_info:
update_markdown_file(filename, blog_info, "<!-- START_MARKER -->", "<!-- END_MARKER -->")
else:
print("Failed to fetch blog info after retries.")
- name: Debug file
run: |
if [ -f docs/substack_blogs.md ]; then
echo "File content:"
cat docs/substack_blogs.md
else
echo "File does not exist"
fi
- name: Commit changes
run: |
git config --local user.email "action@github.com"
git config --local user.name "GitHub Action"
git add docs/substack_blogs.md
# Commit if file changed
if ! git diff --staged --quiet; then
git commit -m "Update Substack blog links [$(date +%s)]"
else
echo "<!-- Updated: $(date) -->" >> docs/substack_blogs.md
git add docs/substack_blogs.md
git commit -m "Force update timestamp [$(date +%s)]"
fi
git push origin master