
Commit 382f3eb

Update substack_simple.yml
1 parent e970f23 commit 382f3eb

1 file changed

.github/workflows/substack_simple.yml

Lines changed: 197 additions & 46 deletions
@@ -24,7 +24,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install feedparser requests beautifulsoup4 fake-useragent requests[socks] pysocks
+          pip install feedparser requests beautifulsoup4 fake-useragent
 
       - name: Run script to fetch Substack posts
         uses: jannekem/run-python-script-action@v1
@@ -37,101 +37,252 @@ jobs:
           import time
           from fake_useragent import UserAgent
 
-          def get_geonode_proxies():
-              print("Fetching proxies from Geonode...")
-              try:
-                  url = "https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc"
-                  resp = requests.get(url, timeout=20)
-                  data = resp.json()
-                  proxies = []
-                  for proxy in data.get("data", []):
-                      if proxy.get("anonymityLevel") == "elite" and "socks4" in proxy.get("protocols", []):
-                          ip = proxy["ip"]
-                          port = proxy["port"]
-                          proxies.append({
-                              "http": f"socks4://{ip}:{port}",
-                              "https": f"socks4://{ip}:{port}"
-                          })
-                  print(f"Loaded {len(proxies)} proxies.")
-                  return proxies
-              except Exception as e:
-                  print(f"Failed to fetch proxy list: {e}")
-                  return []
-
-          proxy_list = get_geonode_proxies()
           ua = UserAgent()
 
-          def get_blog_info(feed_url, num_entries=20):
-              session = requests.Session()
-              proxy = random.choice(proxy_list) if proxy_list else {}
-              session.proxies.update(proxy)
-              session.headers.update({'User-Agent': ua.random})
+          def get_blog_info_direct(feed_url, num_entries=20):
+              """Direct approach without proxies - most reliable for GitHub Actions"""
+              print("Attempting direct connection...")
               try:
-                  response = session.get(feed_url, timeout=20)
-                  raw_feed = response.content
-                  feed = feedparser.parse(raw_feed)
+                  headers = {
+                      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                      'Accept': 'application/rss+xml, application/xml, text/xml',
+                      'Accept-Language': 'en-US,en;q=0.9',
+                      'Accept-Encoding': 'gzip, deflate',
+                      'Connection': 'keep-alive',
+                      'Upgrade-Insecure-Requests': '1',
+                  }
+
+                  session = requests.Session()
+                  session.headers.update(headers)
+
+                  response = session.get(feed_url, timeout=30)
+                  response.raise_for_status()
+
+                  feed = feedparser.parse(response.content)
+
+                  if not feed.entries:
+                      print("No entries found in feed")
+                      return []
+
                   entries = []
                   for entry in feed.entries[:num_entries]:
                       entries.append({
                           "title": entry.title,
                           "link": entry.link
                       })
+
+                  print(f"Successfully fetched {len(entries)} entries")
                   return entries
+
+              except requests.exceptions.RequestException as e:
+                  print(f"Direct request failed: {e}")
+                  return []
               except Exception as e:
-                  print(f"Error fetching with proxy {proxy.get('http')}: {e}")
+                  print(f"Unexpected error: {e}")
                   return []
 
+          def get_blog_info_with_rotation(feed_url, num_entries=20):
+              """Fallback with user agent rotation"""
+              print("Trying with user agent rotation...")
+
+              user_agents = [
+                  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
+                  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/121.0'
+              ]
+
+              for attempt, user_agent in enumerate(user_agents, 1):
+                  try:
+                      print(f"Attempt {attempt} with user agent: {user_agent[:50]}...")
+
+                      headers = {
+                          'User-Agent': user_agent,
+                          'Accept': 'application/rss+xml, application/xml, text/xml, */*',
+                          'Accept-Language': 'en-US,en;q=0.5',
+                          'Accept-Encoding': 'gzip, deflate',
+                          'Connection': 'keep-alive',
+                          'Cache-Control': 'no-cache',
+                          'Pragma': 'no-cache'
+                      }
+
+                      session = requests.Session()
+                      session.headers.update(headers)
+
+                      response = session.get(feed_url, timeout=30)
+                      response.raise_for_status()
+
+                      feed = feedparser.parse(response.content)
+
+                      if feed.entries:
+                          entries = []
+                          for entry in feed.entries[:num_entries]:
+                              entries.append({
+                                  "title": entry.title,
+                                  "link": entry.link
+                              })
+                          print(f"Success! Fetched {len(entries)} entries")
+                          return entries
+
+                  except Exception as e:
+                      print(f"Attempt {attempt} failed: {e}")
+                      if attempt < len(user_agents):
+                          time.sleep(2)
+                          continue
+
+              return []
+
+          def get_blog_info_alternative_endpoint(feed_url, num_entries=20):
+              """Try alternative RSS endpoints"""
+              print("Trying alternative endpoints...")
+
+              # Extract base URL and try different RSS endpoints
+              base_url = feed_url.replace('/feed', '').replace('/rss', '')
+              alternative_urls = [
+                  f"{base_url}/feed",
+                  f"{base_url}/rss",
+                  f"{base_url}/feed.xml",
+                  f"{base_url}/rss.xml",
+                  feed_url  # original
+              ]
+
+              headers = {
+                  'User-Agent': 'Mozilla/5.0 (compatible; RSS Reader/1.0)',
+                  'Accept': 'application/rss+xml, application/xml, text/xml',
+              }
+
+              for url in alternative_urls:
+                  try:
+                      print(f"Trying endpoint: {url}")
+                      response = requests.get(url, headers=headers, timeout=30)
+                      response.raise_for_status()
+
+                      feed = feedparser.parse(response.content)
+
+                      if feed.entries:
+                          entries = []
+                          for entry in feed.entries[:num_entries]:
+                              entries.append({
+                                  "title": entry.title,
+                                  "link": entry.link
+                              })
+                          print(f"Success with {url}! Fetched {len(entries)} entries")
+                          return entries
+
+                  except Exception as e:
+                      print(f"Failed with {url}: {e}")
+                      continue
+
+              return []
+
           def update_markdown_file(filename, blog_info, start_marker, end_marker):
               if not os.path.exists(filename):
-                  print(f"File {filename} not found.")
-                  return
+                  print(f"File {filename} not found. Creating directory structure...")
+                  os.makedirs(os.path.dirname(filename), exist_ok=True)
+                  # Create a basic file structure
+                  with open(filename, 'w', encoding='utf-8') as f:
+                      f.write(f"# Substack Blogs\n\n{start_marker}\n\n{end_marker}\n")
+
               with open(filename, 'r', encoding='utf-8') as f:
                   content = f.read()
-              start_idx = content.find(start_marker) + len(start_marker)
+
+              start_idx = content.find(start_marker)
               end_idx = content.find(end_marker)
+
+              if start_idx == -1 or end_idx == -1:
+                  print("Markers not found in file. Adding them...")
+                  content += f"\n\n{start_marker}\n\n{end_marker}\n"
+                  start_idx = content.find(start_marker)
+                  end_idx = content.find(end_marker)
+
+              start_idx += len(start_marker)
+
               new_section = "\n".join(f"* [{item['title']}]({item['link']})" for item in blog_info)
               updated = content[:start_idx] + "\n" + new_section + "\n" + content[end_idx:]
+
               with open(filename, 'w', encoding='utf-8') as f:
                   f.write(updated)
               print(f"Updated {filename}")
 
-          def fetch_with_retry(feed_url, max_retries=5):
-              for i in range(max_retries):
-                  result = get_blog_info(feed_url)
-                  if result:
-                      return result
-                  print(f"Retry {i+1}/{max_retries}")
-                  time.sleep(2 ** i)
+          def fetch_with_multiple_strategies(feed_url, max_retries=3):
+              """Try multiple strategies to fetch the RSS feed"""
+              strategies = [
+                  get_blog_info_direct,
+                  get_blog_info_with_rotation,
+                  get_blog_info_alternative_endpoint
+              ]
+
+              for strategy_num, strategy in enumerate(strategies, 1):
+                  print(f"\n=== Strategy {strategy_num}: {strategy.__name__} ===")
+
+                  for retry in range(max_retries):
+                      try:
+                          result = strategy(feed_url)
+                          if result:
+                              print(f"✅ Success with strategy {strategy_num}!")
+                              return result
+                          else:
+                              print(f"Strategy {strategy_num}, attempt {retry + 1} returned no results")
+
+                      except Exception as e:
+                          print(f"Strategy {strategy_num}, attempt {retry + 1} failed: {e}")
+
+                      if retry < max_retries - 1:
+                          wait_time = 2 ** retry
+                          print(f"Waiting {wait_time} seconds before retry...")
+                          time.sleep(wait_time)
+
+                  print(f"❌ Strategy {strategy_num} failed after {max_retries} attempts")
+
               return []
 
+          # Main execution
           feed_url = "https://datacommons.substack.com/feed"
-          blog_info = fetch_with_retry(feed_url)
+          print(f"Starting fetch for: {feed_url}")
+
+          blog_info = fetch_with_multiple_strategies(feed_url)
 
           filename = "docs/substack_blogs.md"
           if blog_info:
+              print(f"\n✅ Successfully fetched {len(blog_info)} blog posts!")
               update_markdown_file(filename, blog_info, "<!-- START_MARKER -->", "<!-- END_MARKER -->")
           else:
-              print("Failed to fetch blog info after retries.")
+              print("\n❌ Failed to fetch blog info with all strategies.")
+              print("Creating placeholder entry...")
+              # Create a placeholder entry so the workflow doesn't completely fail
+              placeholder_info = [{
+                  "title": f"Failed to fetch posts - {time.strftime('%Y-%m-%d %H:%M:%S')}",
+                  "link": feed_url
+              }]
+              update_markdown_file(filename, placeholder_info, "<!-- START_MARKER -->", "<!-- END_MARKER -->")
 
       - name: Debug file
         run: |
           if [ -f docs/substack_blogs.md ]; then
-            echo "File content:"
+            echo "File exists. Content:"
             cat docs/substack_blogs.md
           else
             echo "File does not exist"
+            ls -la docs/ || echo "docs directory doesn't exist"
           fi
 
       - name: Commit changes
         run: |
           git config --local user.email "action@github.com"
           git config --local user.name "GitHub Action"
+
+          # Create docs directory if it doesn't exist
+          mkdir -p docs
+
           git add docs/substack_blogs.md
 
-          # Commit if file changed
+          # Check if there are changes to commit
           if ! git diff --staged --quiet; then
             git commit -m "Update Substack blog links [$(date +%s)]"
+            echo "Changes committed"
           else
+            echo "No changes to commit, adding timestamp..."
             echo "<!-- Updated: $(date) -->" >> docs/substack_blogs.md
             git add docs/substack_blogs.md
             git commit -m "Force update timestamp [$(date +%s)]"
