
Commit 66bba47

Create substack_simple.yml
1 parent 64ce481 commit 66bba47

File tree

1 file changed: substack_simple.yml (+279 −0 lines)
name: Fetch Substack Blog Posts
on:
  schedule: # Run workflow automatically
    - cron: '0 0 * * *' # Runs once a day at midnight
  workflow_dispatch: # Run workflow manually through the GitHub UI

jobs:
  fetch-substack-posts:
    name: Fetch latest blog posts from Substack
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          ref: master # Explicitly checkout the master branch

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install feedparser requests beautifulsoup4 fake-useragent
      - name: Run script to fetch Substack posts
        uses: jannekem/run-python-script-action@v1
        with:
          script: |
            import feedparser
            import requests
            import random
            import os
            import time
            from fake_useragent import UserAgent
            from bs4 import BeautifulSoup

            def get_proxies():
                """Get a list of free proxies from various sources"""
                proxy_urls = [
                    "https://free-proxy-list.net/",
                    "https://www.sslproxies.org/"
                ]

                all_proxies = []
                ua = UserAgent()
                headers = {"User-Agent": ua.random}

                for url in proxy_urls:
                    try:
                        print(f"Fetching proxies from {url}")
                        response = requests.get(url, headers=headers, timeout=10)
                        if response.status_code == 200:
                            soup = BeautifulSoup(response.text, 'html.parser')

                            # Common pattern across many proxy list sites
                            table = soup.find('table')
                            if table:
                                for row in table.find_all('tr'):
                                    cells = row.find_all('td')
                                    if len(cells) >= 2:
                                        ip = cells[0].text.strip()
                                        port = cells[1].text.strip()
                                        if ip and port and port.isdigit():
                                            all_proxies.append(f"{ip}:{port}")
                    except Exception as e:
                        print(f"Error fetching proxies from {url}: {e}")

                # Randomize and limit list
                random.shuffle(all_proxies)
                return all_proxies[:5]  # Limit to 5 proxies
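
            # Note: get_proxies() assumes these sites keep IP and port in the
            # first two columns of their first HTML table; if a site changes
            # layout, the list may simply come back empty and the proxy
            # attempts below are skipped.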

            def fetch_feed_with_proxy(feed_url):
                """
                Fetches feed using proxies and random user agents to avoid blocks.
                """
                # Try direct connection first
                ua = UserAgent()
                headers = {"User-Agent": ua.random}

                print("Trying direct connection with a random user agent")
                try:
                    response = requests.get(feed_url, headers=headers, timeout=10)
                    if response.status_code == 200:
                        print("Direct connection successful!")
                        return feedparser.parse(response.content)
                except Exception as e:
                    print(f"Direct connection failed: {e}")

                # Try with proxies
                proxies = get_proxies()
                if proxies:
                    for proxy in proxies:
                        try:
                            print(f"Trying proxy: {proxy}")
                            proxy_dict = {
                                "http": f"http://{proxy}",
                                "https": f"http://{proxy}"
                            }
                            response = requests.get(feed_url, headers={"User-Agent": ua.random}, proxies=proxy_dict, timeout=15)
                            if response.status_code == 200:
                                print(f"Proxy fetch successful with {proxy}")
                                return feedparser.parse(response.content)
                        except Exception as e:
                            print(f"Proxy fetch failed with {proxy}: {e}")

                        # Add delay between requests
                        time.sleep(1)

                # Try a public RSS-to-JSON service as a fallback; let requests
                # URL-encode the feed URL into the query string
                try:
                    print("Trying fallback method with RSS proxy...")
                    response = requests.get(
                        "https://api.rss2json.com/v1/api.json",
                        params={"rss_url": feed_url},
                        headers={"User-Agent": ua.random},
                        timeout=15
                    )
                    if response.status_code == 200:
                        print("Fallback method successful!")
                        json_data = response.json()

                        # Convert JSON items to feedparser-style entries
                        feed_data = {"entries": []}
                        for item in json_data.get("items", []):
                            feed_data["entries"].append({
                                "title": item.get("title", ""),
                                "link": item.get("link", "")
                            })
                        return feed_data
                except Exception as e:
                    print(f"Fallback method failed: {e}")

                # All methods failed
                print("All methods failed")
                return {"entries": []}
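
            # Note: rss2json responses look roughly like
            #   {"status": "ok", "items": [{"title": "...", "link": "...", ...}]}
            # so the fallback above flattens "items" into the same
            # {"title", "link"} shape the rest of the script expects.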

            def get_blog_info(feed_url, num_entries=None):
                """
                Fetches blog titles and links from an RSS feed.
                """
                feed = fetch_feed_with_proxy(feed_url)

                entries = []
                if not feed or "entries" not in feed:
                    print("No entries found in feed")
                    return entries

                # Get all entries or limit if specified
                entries_to_process = feed["entries"] if num_entries is None else feed["entries"][:num_entries]

                for entry in entries_to_process:
                    # Entries may be feedparser objects (attribute access) or
                    # plain dicts from the JSON fallback, so handle both
                    title = entry.get("title", "")
                    if hasattr(entry, "link"):
                        link = entry.link
                    else:
                        link = entry.get("link", "")

                    entries.append({
                        "title": title,
                        "link": link
                    })

                return entries

            def update_markdown_file(filename, blog_info, start_marker, end_marker):
                """
                Updates a markdown file with blog info between specified markers.
                """
                # Create directory if it doesn't exist (guard against a bare
                # filename with no directory component)
                dirname = os.path.dirname(filename)
                if dirname:
                    os.makedirs(dirname, exist_ok=True)

                # Create file if it doesn't exist
                if not os.path.exists(filename):
                    print(f"Creating file {filename} as it doesn't exist")
                    with open(filename, 'w', encoding='utf-8') as f:
                        f.write("# Substack Blog Posts\n\n")
                        f.write(f"{start_marker}\n\n{end_marker}\n")

                # Read existing content
                with open(filename, 'r', encoding='utf-8') as f:
                    file_content = f.read()

                # Find markers, adding them if missing
                start_index = file_content.find(start_marker)
                if start_index == -1:
                    print(f"Start marker '{start_marker}' not found, adding it")
                    file_content += f"\n\n{start_marker}\n\n{end_marker}\n"
                    with open(filename, 'w', encoding='utf-8') as f:
                        f.write(file_content)
                    start_index = file_content.find(start_marker)

                start_index += len(start_marker)
                end_index = file_content.find(end_marker, start_index)

                if end_index == -1:
                    print(f"End marker '{end_marker}' not found, adding it")
                    file_content = file_content[:start_index] + f"\n\n{end_marker}\n"
                    with open(filename, 'w', encoding='utf-8') as f:
                        f.write(file_content)
                    end_index = file_content.find(end_marker, start_index)

                # Generate new content
                new_content = "\n"
                for entry in blog_info:
                    new_content += f"* [{entry['title']}]({entry['link']})\n"

                # Add timestamp to force Git to detect changes
                new_content += f"\n<!-- Updated: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n"

                # Replace everything between the markers
                updated_content = file_content[:start_index] + new_content + file_content[end_index:]

                # Write updated content
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(updated_content)

                print(f"Updated {filename} with {len(blog_info)} blog posts!")
                return True
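
            # For reference, after a successful run docs/substack_blogs.md
            # ends up looking roughly like this (titles illustrative):
            #
            #   # Substack Blog Posts
            #
            #   <!-- START_MARKER -->
            #
            #   * [Some post title](https://datacommons.substack.com/p/...)
            #
            #   <!-- Updated: 2025-01-01 00:00:00 -->
            #   <!-- END_MARKER -->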

            # Main execution
            print("Starting Substack blog post fetcher...")

            # Configuration
            FEED_URL = "https://datacommons.substack.com/feed"
            REPO_FILE_PATH = "docs/substack_blogs.md"
            START_MARKER = "<!-- START_MARKER -->"
            END_MARKER = "<!-- END_MARKER -->"

            # Get blog info
            blog_info = get_blog_info(FEED_URL)

            # Print the blog posts for the log
            if blog_info:
                print(f"\n========= FOUND {len(blog_info)} SUBSTACK BLOG POSTS =========\n")
                for i, entry in enumerate(blog_info):
                    print(f"{i+1}. {entry.get('title', 'No title')}")
                    print(f"   URL: {entry.get('link', 'No link')}")
                    print("")

                # Update the markdown file
                success = update_markdown_file(REPO_FILE_PATH, blog_info, START_MARKER, END_MARKER)
                if success:
                    print(f"{REPO_FILE_PATH} updated successfully!")
            else:
                print("No blog posts found or failed to fetch the feed.")
      - name: Debug file
        run: |
          if [ -f docs/substack_blogs.md ]; then
            echo "File content:"
            cat docs/substack_blogs.md
          else
            echo "File does not exist"
          fi

      - name: Commit changes
        run: |
          git config --local user.email "action@github.com"
          git config --local user.name "GitHub Action"
          git add docs/substack_blogs.md
          # Commit real changes; if there are none, append a timestamp so
          # there is always exactly one commit to push
          if git diff --staged --quiet; then
            echo "<!-- Updated: $(date) -->" >> docs/substack_blogs.md
            git add docs/substack_blogs.md
            git commit -m "Force update timestamp [$(date +%s)]"
          else
            git commit -m "Update Substack blog links [$(date +%s)]"
          fi
          git push origin master
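
As a quick way to smoke-test the feed logic outside Actions, here is a minimal local sketch. It assumes feedparser and requests are installed; the filename and the plain user agent are illustrative, and the proxy/rss2json fallbacks are skipped:

    # local_test.py - check that the Substack feed parses as expected
    import feedparser
    import requests

    FEED_URL = "https://datacommons.substack.com/feed"

    # Direct fetch only; the workflow adds proxy and RSS-to-JSON fallbacks
    response = requests.get(FEED_URL, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    feed = feedparser.parse(response.content)
    for entry in feed.entries:
        print(entry.get("title", ""), "->", entry.get("link", ""))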
