
Commit 382f3eb

Update substack_simple.yml
1 parent e970f23 commit 382f3eb

1 file changed

.github/workflows/substack_simple.yml

Lines changed: 197 additions & 46 deletions
@@ -24,7 +24,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install feedparser requests beautifulsoup4 fake-useragent requests[socks] pysocks
+          pip install feedparser requests beautifulsoup4 fake-useragent
 
       - name: Run script to fetch Substack posts
         uses: jannekem/run-python-script-action@v1
@@ -37,101 +37,252 @@ jobs:
           import time
           from fake_useragent import UserAgent
 
-          def get_geonode_proxies():
-              print("Fetching proxies from Geonode...")
-              try:
-                  url = "https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc"
-                  resp = requests.get(url, timeout=20)
-                  data = resp.json()
-                  proxies = []
-                  for proxy in data.get("data", []):
-                      if proxy.get("anonymityLevel") == "elite" and "socks4" in proxy.get("protocols", []):
-                          ip = proxy["ip"]
-                          port = proxy["port"]
-                          proxies.append({
-                              "http": f"socks4://{ip}:{port}",
-                              "https": f"socks4://{ip}:{port}"
-                          })
-                  print(f"Loaded {len(proxies)} proxies.")
-                  return proxies
-              except Exception as e:
-                  print(f"Failed to fetch proxy list: {e}")
-                  return []
-
-          proxy_list = get_geonode_proxies()
           ua = UserAgent()
 
-          def get_blog_info(feed_url, num_entries=20):
-              session = requests.Session()
-              proxy = random.choice(proxy_list) if proxy_list else {}
-              session.proxies.update(proxy)
-              session.headers.update({'User-Agent': ua.random})
+          def get_blog_info_direct(feed_url, num_entries=20):
+              """Direct approach without proxies - most reliable for GitHub Actions"""
+              print("Attempting direct connection...")
               try:
-                  response = session.get(feed_url, timeout=20)
-                  raw_feed = response.content
-                  feed = feedparser.parse(raw_feed)
+                  headers = {
+                      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                      'Accept': 'application/rss+xml, application/xml, text/xml',
+                      'Accept-Language': 'en-US,en;q=0.9',
+                      'Accept-Encoding': 'gzip, deflate',
+                      'Connection': 'keep-alive',
+                      'Upgrade-Insecure-Requests': '1',
+                  }
+
+                  session = requests.Session()
+                  session.headers.update(headers)
+
+                  response = session.get(feed_url, timeout=30)
+                  response.raise_for_status()
+
+                  feed = feedparser.parse(response.content)
+
+                  if not feed.entries:
+                      print("No entries found in feed")
+                      return []
+
                   entries = []
                   for entry in feed.entries[:num_entries]:
                       entries.append({
                           "title": entry.title,
                           "link": entry.link
                       })
+
+                  print(f"Successfully fetched {len(entries)} entries")
                   return entries
+
+              except requests.exceptions.RequestException as e:
+                  print(f"Direct request failed: {e}")
+                  return []
               except Exception as e:
-                  print(f"Error fetching with proxy {proxy.get('http')}: {e}")
+                  print(f"Unexpected error: {e}")
                   return []
 
+          def get_blog_info_with_rotation(feed_url, num_entries=20):
+              """Fallback with user agent rotation"""
+              print("Trying with user agent rotation...")
+
+              user_agents = [
+                  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
+                  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/121.0'
+              ]
+
+              for attempt, user_agent in enumerate(user_agents, 1):
+                  try:
+                      print(f"Attempt {attempt} with user agent: {user_agent[:50]}...")
+
+                      headers = {
+                          'User-Agent': user_agent,
+                          'Accept': 'application/rss+xml, application/xml, text/xml, */*',
+                          'Accept-Language': 'en-US,en;q=0.5',
+                          'Accept-Encoding': 'gzip, deflate',
+                          'Connection': 'keep-alive',
+                          'Cache-Control': 'no-cache',
+                          'Pragma': 'no-cache'
+                      }
+
+                      session = requests.Session()
+                      session.headers.update(headers)
+
+                      response = session.get(feed_url, timeout=30)
+                      response.raise_for_status()
+
+                      feed = feedparser.parse(response.content)
+
+                      if feed.entries:
+                          entries = []
+                          for entry in feed.entries[:num_entries]:
+                              entries.append({
+                                  "title": entry.title,
+                                  "link": entry.link
+                              })
+                          print(f"Success! Fetched {len(entries)} entries")
+                          return entries
+
+                  except Exception as e:
+                      print(f"Attempt {attempt} failed: {e}")
+                      if attempt < len(user_agents):
+                          time.sleep(2)
+                          continue
+
+              return []
+
+          def get_blog_info_alternative_endpoint(feed_url, num_entries=20):
+              """Try alternative RSS endpoints"""
+              print("Trying alternative endpoints...")
+
+              # Extract base URL and try different RSS endpoints
+              base_url = feed_url.replace('/feed', '').replace('/rss', '')
+              alternative_urls = [
+                  f"{base_url}/feed",
+                  f"{base_url}/rss",
+                  f"{base_url}/feed.xml",
+                  f"{base_url}/rss.xml",
+                  feed_url  # original
+              ]
+
+              headers = {
+                  'User-Agent': 'Mozilla/5.0 (compatible; RSS Reader/1.0)',
+                  'Accept': 'application/rss+xml, application/xml, text/xml',
+              }
+
+              for url in alternative_urls:
+                  try:
+                      print(f"Trying endpoint: {url}")
+                      response = requests.get(url, headers=headers, timeout=30)
+                      response.raise_for_status()
+
+                      feed = feedparser.parse(response.content)
+
+                      if feed.entries:
+                          entries = []
+                          for entry in feed.entries[:num_entries]:
+                              entries.append({
+                                  "title": entry.title,
+                                  "link": entry.link
+                              })
+                          print(f"Success with {url}! Fetched {len(entries)} entries")
+                          return entries
+
+                  except Exception as e:
+                      print(f"Failed with {url}: {e}")
+                      continue
+
+              return []
+
           def update_markdown_file(filename, blog_info, start_marker, end_marker):
               if not os.path.exists(filename):
-                  print(f"File {filename} not found.")
-                  return
+                  print(f"File {filename} not found. Creating directory structure...")
+                  os.makedirs(os.path.dirname(filename), exist_ok=True)
+                  # Create a basic file structure
+                  with open(filename, 'w', encoding='utf-8') as f:
+                      f.write(f"# Substack Blogs\n\n{start_marker}\n\n{end_marker}\n")
+
               with open(filename, 'r', encoding='utf-8') as f:
                   content = f.read()
-              start_idx = content.find(start_marker) + len(start_marker)
+
+              start_idx = content.find(start_marker)
               end_idx = content.find(end_marker)
+
+              if start_idx == -1 or end_idx == -1:
+                  print("Markers not found in file. Adding them...")
+                  content += f"\n\n{start_marker}\n\n{end_marker}\n"
+                  start_idx = content.find(start_marker)
+                  end_idx = content.find(end_marker)
+
+              start_idx += len(start_marker)
+
               new_section = "\n".join(f"* [{item['title']}]({item['link']})" for item in blog_info)
               updated = content[:start_idx] + "\n" + new_section + "\n" + content[end_idx:]
+
               with open(filename, 'w', encoding='utf-8') as f:
                   f.write(updated)
               print(f"Updated {filename}")
 
-          def fetch_with_retry(feed_url, max_retries=5):
-              for i in range(max_retries):
-                  result = get_blog_info(feed_url)
-                  if result:
-                      return result
-                  print(f"Retry {i+1}/{max_retries}")
-                  time.sleep(2 ** i)
+          def fetch_with_multiple_strategies(feed_url, max_retries=3):
+              """Try multiple strategies to fetch the RSS feed"""
+              strategies = [
+                  get_blog_info_direct,
+                  get_blog_info_with_rotation,
+                  get_blog_info_alternative_endpoint
+              ]
+
+              for strategy_num, strategy in enumerate(strategies, 1):
+                  print(f"\n=== Strategy {strategy_num}: {strategy.__name__} ===")
+
+                  for retry in range(max_retries):
+                      try:
+                          result = strategy(feed_url)
+                          if result:
+                              print(f"✅ Success with strategy {strategy_num}!")
+                              return result
+                          else:
+                              print(f"Strategy {strategy_num}, attempt {retry + 1} returned no results")
+
+                      except Exception as e:
+                          print(f"Strategy {strategy_num}, attempt {retry + 1} failed: {e}")
+
+                      if retry < max_retries - 1:
+                          wait_time = 2 ** retry
+                          print(f"Waiting {wait_time} seconds before retry...")
+                          time.sleep(wait_time)
+
+                  print(f"❌ Strategy {strategy_num} failed after {max_retries} attempts")
+
               return []
 
+          # Main execution
           feed_url = "https://datacommons.substack.com/feed"
-          blog_info = fetch_with_retry(feed_url)
+          print(f"Starting fetch for: {feed_url}")
+
+          blog_info = fetch_with_multiple_strategies(feed_url)
 
           filename = "docs/substack_blogs.md"
           if blog_info:
+              print(f"\n✅ Successfully fetched {len(blog_info)} blog posts!")
               update_markdown_file(filename, blog_info, "<!-- START_MARKER -->", "<!-- END_MARKER -->")
           else:
-              print("Failed to fetch blog info after retries.")
+              print("\n❌ Failed to fetch blog info with all strategies.")
+              print("Creating placeholder entry...")
+              # Create a placeholder entry so the workflow doesn't completely fail
+              placeholder_info = [{
+                  "title": f"Failed to fetch posts - {time.strftime('%Y-%m-%d %H:%M:%S')}",
+                  "link": feed_url
+              }]
+              update_markdown_file(filename, placeholder_info, "<!-- START_MARKER -->", "<!-- END_MARKER -->")
 
       - name: Debug file
         run: |
           if [ -f docs/substack_blogs.md ]; then
-            echo "File content:"
+            echo "File exists. Content:"
             cat docs/substack_blogs.md
           else
             echo "File does not exist"
+            ls -la docs/ || echo "docs directory doesn't exist"
           fi
 
       - name: Commit changes
         run: |
           git config --local user.email "action@github.com"
           git config --local user.name "GitHub Action"
+
+          # Create docs directory if it doesn't exist
+          mkdir -p docs
+
           git add docs/substack_blogs.md
 
-          # Commit if file changed
+          # Check if there are changes to commit
           if ! git diff --staged --quiet; then
             git commit -m "Update Substack blog links [$(date +%s)]"
+            echo "Changes committed"
           else
+            echo "No changes to commit, adding timestamp..."
             echo "<!-- Updated: $(date) -->" >> docs/substack_blogs.md
             git add docs/substack_blogs.md
             git commit -m "Force update timestamp [$(date +%s)]"
