 name: substack fetch
+
 on:
-  schedule: # Run workflow automatically
-    - cron: '0 0 * * *' # Runs once a day at midnight
-  workflow_dispatch: # Run workflow manually through the GitHub UI
+  schedule:
+    - cron: '0 0 * * *' # Daily at midnight
+  workflow_dispatch: # Manual trigger
 
 jobs:
   fetch-substack-posts:
     name: Fetch latest blog posts from Substack
     runs-on: ubuntu-latest
+
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
         with:
-          ref: master # Explicitly checkout the master branch
+          ref: master
 
       - name: Set up Python
         uses: actions/setup-python@v4
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install feedparser requests beautifulsoup4 fake-useragent
+          pip install feedparser requests beautifulsoup4 fake-useragent requests[socks] pysocks
 
       - name: Run script to fetch Substack posts
         uses: jannekem/run-python-script-action@v1
@@ -35,224 +37,89 @@ jobs:
             import time
             from fake_useragent import UserAgent
             from bs4 import BeautifulSoup
-
-            def get_proxies():
-                """Get a list of free proxies from various sources"""
-                proxy_urls = [
-                    "https://free-proxy-list.net/",
-                    "https://www.sslproxies.org/"
-                ]
-
-                all_proxies = []
-                ua = UserAgent()
-                headers = {"User-Agent": ua.random}
-
-                for url in proxy_urls:
-                    try:
-                        print(f"Fetching proxies from {url}")
-                        response = requests.get(url, headers=headers, timeout=10)
-                        if response.status_code == 200:
-                            soup = BeautifulSoup(response.text, 'html.parser')
-
-                            # Common pattern across many proxy list sites
-                            table = soup.find('table')
-                            if table:
-                                for row in table.find_all('tr'):
-                                    cells = row.find_all('td')
-                                    if len(cells) >= 2:
-                                        ip = cells[0].text.strip()
-                                        port = cells[1].text.strip()
-                                        if ip and port and port.isdigit():
-                                            all_proxies.append(f"{ip}:{port}")
-                    except Exception as e:
-                        print(f"Error fetching proxies from {url}: {e}")
-
-                # Randomize and limit list
-                random.shuffle(all_proxies)
-                return all_proxies[:5] # Limit to 5 proxies
-
-            def fetch_feed_with_proxy(feed_url):
-                """
-                Fetches feed using proxies and random user agents to avoid blocks.
-                """
-                # Try direct connection first
-                ua = UserAgent()
-                headers = {"User-Agent": ua.random}
-
-                print(f"Trying direct connection with user agent")
+
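+            # Scrape free-proxy-list.net for elite proxies that advertise HTTPS support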
+            def get_free_proxies():
+                print("Fetching proxy list...")
                 try:
-                    response = requests.get(feed_url, headers=headers, timeout=10)
-                    if response.status_code == 200:
-                        print("Direct connection successful!")
-                        return feedparser.parse(response.content)
+                    response = requests.get("https://free-proxy-list.net/")
+                    soup = BeautifulSoup(response.text, 'html.parser')
+                    proxy_table = soup.find('table', id='proxylisttable')
+                    proxies = []
+                    for row in proxy_table.tbody.find_all('tr'):
+                        cols = row.find_all('td')
+                        ip = cols[0].text
+                        port = cols[1].text
+                        anonymity = cols[4].text
+                        https = cols[6].text
+                        if anonymity == "elite proxy" and https == "yes":
+                            proxy = f"http://{ip}:{port}"
+                            proxies.append({ "http": proxy, "https": proxy })
+                    print(f"Found {len(proxies)} proxies.")
+                    return proxies
                 except Exception as e:
-                    print(f"Direct connection failed: {e}")
-
-                # Try with proxies
-                proxies = get_proxies()
-                if proxies:
-                    for proxy in proxies:
-                        try:
-                            print(f"Trying proxy: {proxy}")
-                            proxy_dict = {
-                                "http": f"http://{proxy}",
-                                "https": f"http://{proxy}"
-                            }
-                            response = requests.get(feed_url, headers={"User-Agent": ua.random}, proxies=proxy_dict, timeout=15)
-                            if response.status_code == 200:
-                                print(f"Proxy fetch successful with {proxy}")
-                                return feedparser.parse(response.content)
-                        except Exception as e:
-                            print(f"Proxy fetch failed with {proxy}: {e}")
-
-                        # Add delay between requests
-                        time.sleep(1)
-
-                # Try public RSS to JSON service as fallback
+                    print(f"Error fetching proxy list: {e}")
+                    return []
+
+            proxy_list = get_free_proxies()
+            ua = UserAgent()
+
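+            # Fetch the RSS feed through a randomly chosen proxy (if any) with a random User-Agent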
+            def get_blog_info(feed_url, num_entries=20):
+                session = requests.Session()
+                proxy = random.choice(proxy_list) if proxy_list else {}
+                session.proxies.update(proxy)
+                session.headers.update({'User-Agent': ua.random})
                 try:
-                    print("Trying fallback method with RSS proxy...")
-                    rss_proxy_url = f"https://api.rss2json.com/v1/api.json?rss_url={feed_url}"
-                    response = requests.get(rss_proxy_url, headers={"User-Agent": ua.random}, timeout=15)
-                    if response.status_code == 200:
-                        print("Fallback method successful!")
-                        json_data = response.json()
-
-                        # Convert JSON to feedparser format
-                        feed_data = {"entries": []}
-                        if "items" in json_data:
-                            for item in json_data["items"]:
-                                entry = {}
-                                entry["title"] = item.get("title", "")
-                                entry["link"] = item.get("link", "")
-                                feed_data["entries"].append(entry)
-                        return feed_data
-                except Exception as e:
-                    print(f"Fallback method failed: {e}")
-
-                # All methods failed
-                print("All methods failed")
-                return {"entries": []}
-
-            def get_blog_info(feed_url, num_entries=None):
-                """
-                Fetches blog titles and links from an RSS feed.
-                """
-                feed = fetch_feed_with_proxy(feed_url)
-
-                entries = []
-                if not feed or "entries" not in feed:
-                    print("No entries found in feed")
+                    response = session.get(feed_url, timeout=20)
+                    raw_feed = response.content
+                    feed = feedparser.parse(raw_feed)
+                    entries = []
+                    for entry in feed.entries[:num_entries]:
+                        entries.append({
+                            "title": entry.title,
+                            "link": entry.link
+                        })
                     return entries
-
-                # Get all entries or limit if specified
-                entries_to_process = feed["entries"] if num_entries is None else feed["entries"][:num_entries]
-
-                for entry in entries_to_process:
-                    title = entry.get("title", "")
-                    if hasattr(entry, "link"):
-                        link = entry.link
-                    else:
-                        link = entry.get("link", "")
-
-                    entry_data = {
-                        "title": title,
-                        "link": link
-                    }
-                    entries.append(entry_data)
-
-                return entries
-
+                except Exception as e:
+                    print(f"Error using proxy {proxy.get('http')}: {e}")
+                    return []
+
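+            # Rewrite the list between the start/end markers with the fetched posts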
             def update_markdown_file(filename, blog_info, start_marker, end_marker):
-                """
-                Updates a markdown file with blog info between specified markers.
-                """
-                # Create directory if it doesn't exist
-                os.makedirs(os.path.dirname(filename), exist_ok=True)
-
-                # Create file if it doesn't exist
                 if not os.path.exists(filename):
-                    print(f"Creating file {filename} as it doesn't exist")
-                    with open(filename, 'w', encoding='utf-8') as f:
-                        f.write("# Substack Blog Posts\n\n")
-                        f.write(f"{start_marker}\n\n{end_marker}\n")
-
-                # Read existing content
+                    print(f"File {filename} does not exist.")
+                    return
                 with open(filename, 'r', encoding='utf-8') as f:
                     file_content = f.read()
-
-                # Find markers
-                start_index = file_content.find(start_marker)
-                if start_index == -1:
-                    print(f"Start marker '{start_marker}' not found, adding it")
-                    file_content += f"\n\n{start_marker}\n\n{end_marker}\n"
-                    with open(filename, 'w', encoding='utf-8') as f:
-                        f.write(file_content)
-
-                    # Read updated content
-                    with open(filename, 'r', encoding='utf-8') as f:
-                        file_content = f.read()
-                    start_index = file_content.find(start_marker)
-
-                start_index += len(start_marker)
-                end_index = file_content.find(end_marker, start_index)
-
-                if end_index == -1:
-                    print(f"End marker '{end_marker}' not found, adding it")
-                    file_content = file_content[:start_index] + f"\n\n{end_marker}\n"
-                    with open(filename, 'w', encoding='utf-8') as f:
-                        f.write(file_content)
-
-                    # Read updated content
-                    with open(filename, 'r', encoding='utf-8') as f:
-                        file_content = f.read()
-                    end_index = file_content.find(end_marker, start_index)
-
-                # Generate new content
-                new_content = "\n"
+                start_index = file_content.find(start_marker) + len(start_marker)
+                end_index = file_content.find(end_marker)
+                new_content = ""
                 for entry in blog_info:
                     new_content += f"* [{entry['title']}]({entry['link']})\n"
-
-                # Add timestamp to force Git to detect changes
-                new_content += f"\n<!-- Updated: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n"
-
-                # Update content
-                updated_content = file_content[:start_index] + new_content + file_content[end_index:]
-
-                # Write updated content
+                updated_content = file_content[:start_index] + "\n" + new_content + file_content[end_index:]
                 with open(filename, 'w', encoding='utf-8') as f:
                     f.write(updated_content)
-
-                print(f"Updated {filename} with {len(blog_info)} blog posts!")
-                return True
-
-            # Main execution
-            print("Starting Substack blog post fetcher...")
-
-            # Configuration
-            FEED_URL = "https://datacommons.substack.com/feed"
-            REPO_FILE_PATH = "docs/substack_blogs.md"
-            START_MARKER = "<!-- START_MARKER -->"
-            END_MARKER = "<!-- END_MARKER -->"
-
-            # Get blog info
-            blog_info = get_blog_info(FEED_URL)
-
-            # Print the blog posts for the log
+                print(f"Updated {filename} successfully!")
+
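+            # Retry the feed fetch with exponential backoff between attempts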
+            def fetch_with_retry(feed_url, max_retries=5):
+                retries = 0
+                while retries < max_retries:
+                    blog_info = get_blog_info(feed_url)
+                    if blog_info:
+                        return blog_info
+                    print(f"Retrying {retries + 1}/{max_retries}...")
+                    retries += 1
+                    time.sleep(2 ** retries)
+                return []
+
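+            # Main flow: fetch the feed, then update the markdown file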
+            feed_url = "https://datacommons.substack.com/feed"
+            blog_info = fetch_with_retry(feed_url)
+
+            filename = "docs/substack_blogs.md"
             if blog_info:
-                print(f"\n========= FOUND {len(blog_info)} SUBSTACK BLOG POSTS =========\n")
-
-                for i, entry in enumerate(blog_info):
-                    print(f"{i+1}. {entry.get('title', 'No title')}")
-                    print(f" URL: {entry.get('link', 'No link')}")
-                    print("")
-
-                # Update the markdown file
-                success = update_markdown_file(REPO_FILE_PATH, blog_info, START_MARKER, END_MARKER)
-
-                if success:
-                    print(f"{REPO_FILE_PATH} updated successfully!")
+                start_marker = "<!-- START_MARKER -->"
+                end_marker = "<!-- END_MARKER -->"
+                update_markdown_file(filename, blog_info, start_marker, end_marker)
             else:
-                print("No blog posts found or failed to fetch the feed.")
+                print("Failed to fetch blog info after all retries.")
 
       - name: Debug file
         run: |
@@ -268,12 +135,14 @@ jobs:
           git config --local user.email "action@github.com"
           git config --local user.name "GitHub Action"
           git add docs/substack_blogs.md
-          # Force a commit even if there are no changes
-          git diff --staged --quiet || git commit -m "Update Substack blog links [$(date +%s)]"
-          # Even if no changes, add a timestamp and commit
-          if git diff --staged --quiet; then
+
+          # Check if any staged changes exist
+          if ! git diff --staged --quiet; then
+            git commit -m "Update Substack blog links [$(date +%s)]"
+          else
             echo "<!-- Updated: $(date) -->" >> docs/substack_blogs.md
             git add docs/substack_blogs.md
             git commit -m "Force update timestamp [$(date +%s)]"
           fi
+
           git push origin master