name: Fetch Substack Blog Posts
on:
  schedule: # Run workflow automatically
    - cron: '0 0 * * *' # Runs once a day at midnight
  workflow_dispatch: # Run workflow manually through the GitHub UI
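# Note: if the default GITHUB_TOKEN for this repository is read-only, a top-level
# "permissions: contents: write" block is needed here so the final push step can succeed.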

jobs:
  fetch-substack-posts:
    name: Fetch latest blog posts from Substack
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          ref: master # Explicitly checkout the master branch

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install feedparser requests beautifulsoup4 fake-useragent

      - name: Run script to fetch Substack posts
        uses: jannekem/run-python-script-action@v1
        with:
          script: |
            import feedparser
            import requests
            import random
            import os
            import time
            from fake_useragent import UserAgent
            from bs4 import BeautifulSoup
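            # Overall flow: fetch the Substack RSS feed directly, retry through free proxies
            # if blocked, fall back to an RSS-to-JSON service, then rewrite the section of
            # docs/substack_blogs.md between the markers defined below.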

            def get_proxies():
                """Get a list of free proxies from various sources"""
                proxy_urls = [
                    "https://free-proxy-list.net/",
                    "https://www.sslproxies.org/"
                ]

                all_proxies = []
                ua = UserAgent()
                headers = {"User-Agent": ua.random}

                for url in proxy_urls:
                    try:
                        print(f"Fetching proxies from {url}")
                        response = requests.get(url, headers=headers, timeout=10)
                        if response.status_code == 200:
                            soup = BeautifulSoup(response.text, 'html.parser')

                            # Common pattern across many proxy list sites
                            table = soup.find('table')
                            if table:
                                for row in table.find_all('tr'):
                                    cells = row.find_all('td')
                                    if len(cells) >= 2:
                                        ip = cells[0].text.strip()
                                        port = cells[1].text.strip()
                                        if ip and port and port.isdigit():
                                            all_proxies.append(f"{ip}:{port}")
                    except Exception as e:
                        print(f"Error fetching proxies from {url}: {e}")

                # Randomize and limit the list
                random.shuffle(all_proxies)
                return all_proxies[:5]  # Limit to 5 proxies

            def fetch_feed_with_proxy(feed_url):
                """
                Fetches the feed using proxies and random user agents to avoid blocks.
                """
                # Try a direct connection first
                ua = UserAgent()
                headers = {"User-Agent": ua.random}

                print("Trying direct connection with a random user agent")
                try:
                    response = requests.get(feed_url, headers=headers, timeout=10)
                    if response.status_code == 200:
                        print("Direct connection successful!")
                        return feedparser.parse(response.content)
                except Exception as e:
                    print(f"Direct connection failed: {e}")

                # Try with proxies
                proxies = get_proxies()
                if proxies:
                    for proxy in proxies:
                        try:
                            print(f"Trying proxy: {proxy}")
                            proxy_dict = {
                                "http": f"http://{proxy}",
                                "https": f"http://{proxy}"
                            }
                            response = requests.get(feed_url, headers={"User-Agent": ua.random}, proxies=proxy_dict, timeout=15)
                            if response.status_code == 200:
                                print(f"Proxy fetch successful with {proxy}")
                                return feedparser.parse(response.content)
                        except Exception as e:
                            print(f"Proxy fetch failed with {proxy}: {e}")

                        # Add a delay between requests
                        time.sleep(1)

                # Try a public RSS-to-JSON service as a fallback
                try:
                    print("Trying fallback method with RSS proxy...")
                    rss_proxy_url = "https://api.rss2json.com/v1/api.json"
                    response = requests.get(rss_proxy_url, params={"rss_url": feed_url}, headers={"User-Agent": ua.random}, timeout=15)
                    if response.status_code == 200:
                        print("Fallback method successful!")
                        json_data = response.json()

                        # Convert the JSON response to a feedparser-like structure
                        feed_data = {"entries": []}
                        for item in json_data.get("items", []):
                            feed_data["entries"].append({
                                "title": item.get("title", ""),
                                "link": item.get("link", "")
                            })
                        return feed_data
                except Exception as e:
                    print(f"Fallback method failed: {e}")

                # All methods failed
                print("All methods failed")
                return {"entries": []}

            def get_blog_info(feed_url, num_entries=None):
                """
                Fetches blog titles and links from an RSS feed.
                """
                feed = fetch_feed_with_proxy(feed_url)

                entries = []
                if not feed or "entries" not in feed:
                    print("No entries found in feed")
                    return entries

                # Get all entries, or limit the count if specified
                entries_to_process = feed["entries"] if num_entries is None else feed["entries"][:num_entries]

                for entry in entries_to_process:
                    title = entry.get("title", "")
                    # feedparser entries expose attributes; the JSON fallback returns plain dicts
                    if hasattr(entry, "link"):
                        link = entry.link
                    else:
                        link = entry.get("link", "")

                    entries.append({
                        "title": title,
                        "link": link
                    })

                return entries

            def update_markdown_file(filename, blog_info, start_marker, end_marker):
                """
                Updates a markdown file with blog info between the specified markers.
                """
                # Create the directory if it doesn't exist (dirname is empty for bare filenames)
                directory = os.path.dirname(filename)
                if directory:
                    os.makedirs(directory, exist_ok=True)

                # Create the file if it doesn't exist
                if not os.path.exists(filename):
                    print(f"Creating file {filename} as it doesn't exist")
                    with open(filename, 'w', encoding='utf-8') as f:
                        f.write("# Substack Blog Posts\n\n")
                        f.write(f"{start_marker}\n\n{end_marker}\n")

                # Read the existing content
                with open(filename, 'r', encoding='utf-8') as f:
                    file_content = f.read()

                # If the start marker is missing, append both markers and re-read the file
                start_index = file_content.find(start_marker)
                if start_index == -1:
                    print(f"Start marker '{start_marker}' not found, adding it")
                    file_content += f"\n\n{start_marker}\n\n{end_marker}\n"
                    with open(filename, 'w', encoding='utf-8') as f:
                        f.write(file_content)

                    with open(filename, 'r', encoding='utf-8') as f:
                        file_content = f.read()
                    start_index = file_content.find(start_marker)

                start_index += len(start_marker)
                end_index = file_content.find(end_marker, start_index)

                # If the end marker is missing, append it after the start marker and re-read the file
                if end_index == -1:
                    print(f"End marker '{end_marker}' not found, adding it")
                    file_content = file_content[:start_index] + f"\n\n{end_marker}\n"
                    with open(filename, 'w', encoding='utf-8') as f:
                        f.write(file_content)

                    with open(filename, 'r', encoding='utf-8') as f:
                        file_content = f.read()
                    end_index = file_content.find(end_marker, start_index)

                # Generate the new content
                new_content = "\n"
                for entry in blog_info:
                    new_content += f"* [{entry['title']}]({entry['link']})\n"

                # Add a timestamp so Git always detects a change
                new_content += f"\n<!-- Updated: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n"

                # Splice the new content between the markers
                updated_content = file_content[:start_index] + new_content + file_content[end_index:]

                # Write the updated content
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(updated_content)

                print(f"Updated {filename} with {len(blog_info)} blog posts!")
                return True

            # Main execution
            print("Starting Substack blog post fetcher...")

            # Configuration
            FEED_URL = "https://datacommons.substack.com/feed"
            REPO_FILE_PATH = "docs/substack_blogs.md"
            START_MARKER = "<!-- START_MARKER -->"
            END_MARKER = "<!-- END_MARKER -->"

            # Get blog info
            blog_info = get_blog_info(FEED_URL)

            # Print the blog posts for the log
            if blog_info:
                print(f"\n========= FOUND {len(blog_info)} SUBSTACK BLOG POSTS =========\n")

                for i, entry in enumerate(blog_info):
                    print(f"{i+1}. {entry.get('title', 'No title')}")
                    print(f"   URL: {entry.get('link', 'No link')}")
                    print("")

                # Update the markdown file
                success = update_markdown_file(REPO_FILE_PATH, blog_info, START_MARKER, END_MARKER)

                if success:
                    print(f"{REPO_FILE_PATH} updated successfully!")
            else:
                print("No blog posts found or failed to fetch the feed.")
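            # Note: the script finishes successfully even when the fetch fails, so the
            # debug and commit steps below still run; in that case only the timestamp changes.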

      - name: Debug file
        run: |
          if [ -f docs/substack_blogs.md ]; then
            echo "File content:"
            cat docs/substack_blogs.md
          else
            echo "File does not exist"
          fi

      - name: Commit changes
        run: |
          git config --local user.email "action@github.com"
          git config --local user.name "GitHub Action"
          git add docs/substack_blogs.md
          # Commit the updated file if it changed; otherwise append a timestamp so there is always something to push
          if git diff --staged --quiet; then
            echo "<!-- Updated: $(date) -->" >> docs/substack_blogs.md
            git add docs/substack_blogs.md
            git commit -m "Force update timestamp [$(date +%s)]"
          else
            git commit -m "Update Substack blog links [$(date +%s)]"
          fi
          git push origin master