2424 - name : Install dependencies
2525 run : |
2626 python -m pip install --upgrade pip
27- pip install feedparser requests beautifulsoup4 fake-useragent requests[socks] pysocks
27+ pip install feedparser requests beautifulsoup4 fake-useragent
2828
2929 - name : Run script to fetch Substack posts
3030 uses : jannekem/run-python-script-action@v1
@@ -37,101 +37,252 @@ jobs:
3737 import time
3838 from fake_useragent import UserAgent
3939
40- def get_geonode_proxies():
41- print("Fetching proxies from Geonode...")
42- try:
43- url = "https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc"
44- resp = requests.get(url, timeout=20)
45- data = resp.json()
46- proxies = []
47- for proxy in data.get("data", []):
48- if proxy.get("anonymityLevel") == "elite" and "socks4" in proxy.get("protocols", []):
49- ip = proxy["ip"]
50- port = proxy["port"]
51- proxies.append({
52- "http": f"socks4://{ip}:{port}",
53- "https": f"socks4://{ip}:{port}"
54- })
55- print(f"Loaded {len(proxies)} proxies.")
56- return proxies
57- except Exception as e:
58- print(f"Failed to fetch proxy list: {e}")
59- return []
60-
61- proxy_list = get_geonode_proxies()
6240 ua = UserAgent()
6341
def get_blog_info_direct(feed_url, num_entries=20):
    """Direct approach without proxies - most reliable for GitHub Actions"""
    print("Attempting direct connection...")
    try:
        # Impersonate a desktop browser; Substack blocks obvious bots.
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'application/rss+xml, application/xml, text/xml',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        http = requests.Session()
        http.headers.update(browser_headers)

        resp = http.get(feed_url, timeout=30)
        resp.raise_for_status()

        feed = feedparser.parse(resp.content)

        if not feed.entries:
            print("No entries found in feed")
            return []

        # Keep only title + link for the newest num_entries posts.
        entries = [
            {"title": entry.title, "link": entry.link}
            for entry in feed.entries[:num_entries]
        ]

        print(f"Successfully fetched {len(entries)} entries")
        return entries

    except requests.exceptions.RequestException as e:
        print(f"Direct request failed: {e}")
        return []
    except Exception as e:
        print(f"Unexpected error : {e}")
        return []
8383
def get_blog_info_with_rotation(feed_url, num_entries=20):
    """Fallback with user agent rotation"""
    print("Trying with user agent rotation...")

    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/121.0'
    ]

    total = len(user_agents)
    for attempt, agent in enumerate(user_agents, 1):
        try:
            print(f"Attempt {attempt} with user agent: {agent[:50]}...")

            sess = requests.Session()
            sess.headers.update({
                'User-Agent': agent,
                'Accept': 'application/rss+xml, application/xml, text/xml, */*',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
                'Cache-Control': 'no-cache',
                'Pragma': 'no-cache'
            })

            reply = sess.get(feed_url, timeout=30)
            reply.raise_for_status()

            parsed = feedparser.parse(reply.content)

            if parsed.entries:
                posts = [
                    {"title": entry.title, "link": entry.link}
                    for entry in parsed.entries[:num_entries]
                ]
                print(f"Success! Fetched {len(posts)} entries")
                return posts
            # Feed parsed but empty: fall through to the next user agent
            # without sleeping (only failures back off, as before).

        except Exception as e:
            print(f"Attempt {attempt} failed: {e}")
            if attempt < total:
                time.sleep(2)

    return []
135+
def candidate_feed_urls(feed_url):
    """Build the ordered, de-duplicated list of RSS endpoints to probe.

    Only a *trailing* '/feed' or '/rss' segment is stripped to obtain the
    base URL: str.replace() would strip the substring anywhere in the URL
    and mangle hosts that merely contain it (e.g. 'myfeeds.example.com').
    Requires Python 3.9+ for str.removesuffix.
    """
    base_url = feed_url.removesuffix('/feed').removesuffix('/rss')
    candidates = [
        f"{base_url}/feed",
        f"{base_url}/rss",
        f"{base_url}/feed.xml",
        f"{base_url}/rss.xml",
        feed_url,  # original, as a last resort
    ]
    # The original URL usually duplicates one of the variants; drop
    # duplicates while preserving priority order so we don't hit the
    # same endpoint twice.
    seen = set()
    unique = []
    for url in candidates:
        if url not in seen:
            seen.add(url)
            unique.append(url)
    return unique

def get_blog_info_alternative_endpoint(feed_url, num_entries=20):
    """Try alternative RSS endpoints"""
    print("Trying alternative endpoints...")

    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; RSS Reader/1.0)',
        'Accept': 'application/rss+xml, application/xml, text/xml',
    }

    for url in candidate_feed_urls(feed_url):
        try:
            print(f"Trying endpoint: {url}")
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            feed = feedparser.parse(response.content)

            if feed.entries:
                # Keep only title + link for the newest num_entries posts.
                entries = []
                for entry in feed.entries[:num_entries]:
                    entries.append({
                        "title": entry.title,
                        "link": entry.link
                    })
                print(f"Success with {url}! Fetched {len(entries)} entries")
                return entries

        except Exception as e:
            # Best-effort: log and move on to the next candidate endpoint.
            print(f"Failed with {url}: {e}")
            continue

    return []
178+
def update_markdown_file(filename, blog_info, start_marker, end_marker):
    """Rewrite the section of *filename* between the two marker strings
    with a markdown bullet list built from *blog_info*.

    Creates the file (and any missing parent directories) with a skeleton
    containing the markers when it does not exist, and appends the markers
    when they are absent from the existing content.

    Args:
        filename: path of the markdown file to update.
        blog_info: list of {"title": ..., "link": ...} dicts.
        start_marker: literal string opening the managed section.
        end_marker: literal string closing the managed section.
    """
    if not os.path.exists(filename):
        print(f"File {filename} not found. Creating directory structure...")
        # os.makedirs('') raises FileNotFoundError, so only create parents
        # when the path actually has a directory component.
        parent = os.path.dirname(filename)
        if parent:
            os.makedirs(parent, exist_ok=True)
        # Create a basic file structure
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(f"# Substack Blogs\n\n{start_marker}\n\n{end_marker}\n")

    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()

    start_idx = content.find(start_marker)
    # Search for the end marker *after* the start marker so a stray end
    # marker earlier in the file cannot produce a backwards slice.
    if start_idx != -1:
        end_idx = content.find(end_marker, start_idx + len(start_marker))
    else:
        end_idx = -1

    if start_idx == -1 or end_idx == -1:
        print("Markers not found in file. Adding them...")
        content += f"\n\n{start_marker}\n\n{end_marker}\n"
        start_idx = content.find(start_marker)
        end_idx = content.find(end_marker, start_idx + len(start_marker))

    start_idx += len(start_marker)

    new_section = "\n".join(f"* [{item['title']}]({item['link']})" for item in blog_info)
    updated = content[:start_idx] + "\n" + new_section + "\n" + content[end_idx:]

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(updated)
    print(f"Updated {filename}")
97207
def fetch_with_multiple_strategies(feed_url, max_retries=3):
    """Try multiple strategies to fetch the RSS feed"""
    strategies = [
        get_blog_info_direct,
        get_blog_info_with_rotation,
        get_blog_info_alternative_endpoint
    ]

    for strategy_num, strategy in enumerate(strategies, 1):
        print(f"\n=== Strategy {strategy_num}: {strategy.__name__} ===")

        for retry in range(max_retries):
            try:
                result = strategy(feed_url)
            except Exception as e:
                print(f"Strategy {strategy_num}, attempt {retry + 1} failed: {e}")
            else:
                if result:
                    print(f"✅ Success with strategy {strategy_num}!")
                    return result
                print(f"Strategy {strategy_num}, attempt {retry + 1} returned no results")

            # Exponential backoff between attempts; no sleep after the last
            # attempt of a strategy.
            if retry < max_retries - 1:
                wait_time = 2 ** retry
                print(f"Waiting {wait_time} seconds before retry...")
                time.sleep(wait_time)

        print(f"❌ Strategy {strategy_num} failed after {max_retries} attempts")

    return []
106239
# Main execution: fetch the feed, then rewrite the managed markdown section.
feed_url = "https://datacommons.substack.com/feed"
print(f"Starting fetch for: {feed_url}")

blog_info = fetch_with_multiple_strategies(feed_url)

filename = "docs/substack_blogs.md"
if blog_info:
    print(f"\n✅ Successfully fetched {len(blog_info)} blog posts!")
else:
    print("\n❌ Failed to fetch blog info with all strategies.")
    print("Creating placeholder entry...")
    # Create a placeholder entry so the workflow doesn't completely fail
    blog_info = [{
        "title": f"Failed to fetch posts - {time.strftime('%Y-%m-%d %H:%M:%S')}",
        "link": feed_url
    }]

update_markdown_file(filename, blog_info, "<!-- START_MARKER -->", "<!-- END_MARKER -->")
115259
116260 - name : Debug file
117261 run : |
118262 if [ -f docs/substack_blogs.md ]; then
119- echo "File content :"
263+ echo "File exists. Content :"
120264 cat docs/substack_blogs.md
121265 else
122266 echo "File does not exist"
267+ ls -la docs/ || echo "docs directory doesn't exist"
123268 fi
124269
125270 - name : Commit changes
126271 run : |
127272 git config --local user.email "action@github.com"
128273 git config --local user.name "GitHub Action"
274+
275+ # Create docs directory if it doesn't exist
276+ mkdir -p docs
277+
129278 git add docs/substack_blogs.md
130279
131- # Commit if file changed
280+ # Check if there are changes to commit
132281 if ! git diff --staged --quiet; then
133282 git commit -m "Update Substack blog links [$(date +%s)]"
283+ echo "Changes committed"
134284 else
285+ echo "No changes to commit, adding timestamp..."
135286 echo "<!-- Updated: $(date) -->" >> docs/substack_blogs.md
136287 git add docs/substack_blogs.md
137288 git commit -m "Force update timestamp [$(date +%s)]"
0 commit comments