substack fetch #9
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: substack fetch | |
| on: | |
| schedule: | |
| - cron: '0 0 * * *' # Daily at midnight | |
| workflow_dispatch: | |
| jobs: | |
| fetch-substack-posts: | |
| name: Fetch latest blog posts from Substack | |
| runs-on: ubuntu-latest | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v3 | |
| with: | |
| ref: master | |
| - name: Set up Python | |
| uses: actions/setup-python@v4 | |
| with: | |
| python-version: '3.10' | |
| - name: Install dependencies | |
| run: | | |
| python -m pip install --upgrade pip | |
| pip install feedparser requests beautifulsoup4 fake-useragent requests[socks] pysocks | |
| - name: Run script to fetch Substack posts | |
| uses: jannekem/run-python-script-action@v1 | |
| with: | |
| script: | | |
| import feedparser | |
| import requests | |
| import random | |
| import os | |
| import time | |
| from fake_useragent import UserAgent | |
| def get_geonode_proxies(): | |
| print("Fetching proxies from Geonode...") | |
| try: | |
| url = "https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc" | |
| resp = requests.get(url, timeout=20) | |
| data = resp.json() | |
| proxies = [] | |
| for proxy in data.get("data", []): | |
| if proxy.get("anonymityLevel") == "elite" and "socks4" in proxy.get("protocols", []): | |
| ip = proxy["ip"] | |
| port = proxy["port"] | |
| proxies.append({ | |
| "http": f"socks4://{ip}:{port}", | |
| "https": f"socks4://{ip}:{port}" | |
| }) | |
| print(f"Loaded {len(proxies)} proxies.") | |
| return proxies | |
| except Exception as e: | |
| print(f"Failed to fetch proxy list: {e}") | |
| return [] | |
| proxy_list = get_geonode_proxies() | |
| ua = UserAgent() | |
| def get_blog_info(feed_url, num_entries=20): | |
| session = requests.Session() | |
| proxy = random.choice(proxy_list) if proxy_list else {} | |
| session.proxies.update(proxy) | |
| session.headers.update({'User-Agent': ua.random}) | |
| try: | |
| response = session.get(feed_url, timeout=20) | |
| raw_feed = response.content | |
| feed = feedparser.parse(raw_feed) | |
| entries = [] | |
| for entry in feed.entries[:num_entries]: | |
| entries.append({ | |
| "title": entry.title, | |
| "link": entry.link | |
| }) | |
| return entries | |
| except Exception as e: | |
| print(f"Error fetching with proxy {proxy.get('http')}: {e}") | |
| return [] | |
| def update_markdown_file(filename, blog_info, start_marker, end_marker): | |
| if not os.path.exists(filename): | |
| print(f"File {filename} not found.") | |
| return | |
| with open(filename, 'r', encoding='utf-8') as f: | |
| content = f.read() | |
| start_idx = content.find(start_marker) + len(start_marker) | |
| end_idx = content.find(end_marker) | |
| new_section = "\n".join(f"* [{item['title']}]({item['link']})" for item in blog_info) | |
| updated = content[:start_idx] + "\n" + new_section + "\n" + content[end_idx:] | |
| with open(filename, 'w', encoding='utf-8') as f: | |
| f.write(updated) | |
| print(f"Updated {filename}") | |
| def fetch_with_retry(feed_url, max_retries=5): | |
| for i in range(max_retries): | |
| result = get_blog_info(feed_url) | |
| if result: | |
| return result | |
| print(f"Retry {i+1}/{max_retries}") | |
| time.sleep(2 ** i) | |
| return [] | |
| feed_url = "https://datacommons.substack.com/feed" | |
| blog_info = fetch_with_retry(feed_url) | |
| filename = "docs/substack_blogs.md" | |
| if blog_info: | |
| update_markdown_file(filename, blog_info, "<!-- START_MARKER -->", "<!-- END_MARKER -->") | |
| else: | |
| print("Failed to fetch blog info after retries.") | |
| - name: Debug file | |
| run: | | |
| if [ -f docs/substack_blogs.md ]; then | |
| echo "File content:" | |
| cat docs/substack_blogs.md | |
| else | |
| echo "File does not exist" | |
| fi | |
| - name: Commit changes | |
| run: | | |
| git config --local user.email "action@github.com" | |
| git config --local user.name "GitHub Action" | |
| git add docs/substack_blogs.md | |
| # Commit if file changed | |
| if ! git diff --staged --quiet; then | |
| git commit -m "Update Substack blog links [$(date +%s)]" | |
| else | |
| echo "<!-- Updated: $(date) -->" >> docs/substack_blogs.md | |
| git add docs/substack_blogs.md | |
| git commit -m "Force update timestamp [$(date +%s)]" | |
| fi | |
| git push origin master |