
Commit 94ea0cb

Update substack_simple.yml
1 parent 705b141 commit 94ea0cb

1 file changed: substack_simple.yml (82 additions, 213 deletions)
@@ -1,18 +1,20 @@
 name: substack fetch
+
 on:
-  schedule: # Run workflow automatically
-    - cron: '0 0 * * *' # Runs once a day at midnight
-  workflow_dispatch: # Run workflow manually through the GitHub UI
+  schedule:
+    - cron: '0 0 * * *' # Daily at midnight
+  workflow_dispatch: # Manual trigger
 
 jobs:
   fetch-substack-posts:
     name: Fetch latest blog posts from Substack
     runs-on: ubuntu-latest
+
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
         with:
-          ref: master # Explicitly checkout the master branch
+          ref: master
 
       - name: Set up Python
         uses: actions/setup-python@v4
@@ -22,7 +24,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install feedparser requests beautifulsoup4 fake-useragent
+          pip install feedparser requests beautifulsoup4 fake-useragent requests[socks] pysocks
 
       - name: Run script to fetch Substack posts
         uses: jannekem/run-python-script-action@v1
@@ -35,224 +37,89 @@ jobs:
             import time
             from fake_useragent import UserAgent
             from bs4 import BeautifulSoup
-
-            def get_proxies():
-                """Get a list of free proxies from various sources"""
-                proxy_urls = [
-                    "https://free-proxy-list.net/",
-                    "https://www.sslproxies.org/"
-                ]
-
-                all_proxies = []
-                ua = UserAgent()
-                headers = {"User-Agent": ua.random}
-
-                for url in proxy_urls:
-                    try:
-                        print(f"Fetching proxies from {url}")
-                        response = requests.get(url, headers=headers, timeout=10)
-                        if response.status_code == 200:
-                            soup = BeautifulSoup(response.text, 'html.parser')
-
-                            # Common pattern across many proxy list sites
-                            table = soup.find('table')
-                            if table:
-                                for row in table.find_all('tr'):
-                                    cells = row.find_all('td')
-                                    if len(cells) >= 2:
-                                        ip = cells[0].text.strip()
-                                        port = cells[1].text.strip()
-                                        if ip and port and port.isdigit():
-                                            all_proxies.append(f"{ip}:{port}")
-                    except Exception as e:
-                        print(f"Error fetching proxies from {url}: {e}")
-
-                # Randomize and limit list
-                random.shuffle(all_proxies)
-                return all_proxies[:5] # Limit to 5 proxies
-
-            def fetch_feed_with_proxy(feed_url):
-                """
-                Fetches feed using proxies and random user agents to avoid blocks.
-                """
-                # Try direct connection first
-                ua = UserAgent()
-                headers = {"User-Agent": ua.random}
-
-                print(f"Trying direct connection with user agent")
+
+            def get_free_proxies():
+                print("Fetching proxy list...")
                 try:
-                    response = requests.get(feed_url, headers=headers, timeout=10)
-                    if response.status_code == 200:
-                        print("Direct connection successful!")
-                        return feedparser.parse(response.content)
+                    response = requests.get("https://free-proxy-list.net/")
+                    soup = BeautifulSoup(response.text, 'html.parser')
+                    proxy_table = soup.find('table', id='proxylisttable')
+                    proxies = []
+                    for row in proxy_table.tbody.find_all('tr'):
+                        cols = row.find_all('td')
+                        ip = cols[0].text
+                        port = cols[1].text
+                        anonymity = cols[4].text
+                        https = cols[6].text
+                        if anonymity == "elite proxy" and https == "yes":
+                            proxy = f"http://{ip}:{port}"
+                            proxies.append({ "http": proxy, "https": proxy })
+                    print(f"Found {len(proxies)} proxies.")
+                    return proxies
                 except Exception as e:
-                    print(f"Direct connection failed: {e}")
-
-                # Try with proxies
-                proxies = get_proxies()
-                if proxies:
-                    for proxy in proxies:
-                        try:
-                            print(f"Trying proxy: {proxy}")
-                            proxy_dict = {
-                                "http": f"http://{proxy}",
-                                "https": f"http://{proxy}"
-                            }
-                            response = requests.get(feed_url, headers={"User-Agent": ua.random}, proxies=proxy_dict, timeout=15)
-                            if response.status_code == 200:
-                                print(f"Proxy fetch successful with {proxy}")
-                                return feedparser.parse(response.content)
-                        except Exception as e:
-                            print(f"Proxy fetch failed with {proxy}: {e}")
-
-                        # Add delay between requests
-                        time.sleep(1)
-
-                # Try public RSS to JSON service as fallback
+                    print(f"Error fetching proxy list: {e}")
+                    return []
+
+            proxy_list = get_free_proxies()
+            ua = UserAgent()
+
+            def get_blog_info(feed_url, num_entries=20):
+                session = requests.Session()
+                proxy = random.choice(proxy_list) if proxy_list else {}
+                session.proxies.update(proxy)
+                session.headers.update({'User-Agent': ua.random})
                 try:
-                    print("Trying fallback method with RSS proxy...")
-                    rss_proxy_url = f"https://api.rss2json.com/v1/api.json?rss_url={feed_url}"
-                    response = requests.get(rss_proxy_url, headers={"User-Agent": ua.random}, timeout=15)
-                    if response.status_code == 200:
-                        print("Fallback method successful!")
-                        json_data = response.json()
-
-                        # Convert JSON to feedparser format
-                        feed_data = {"entries": []}
-                        if "items" in json_data:
-                            for item in json_data["items"]:
-                                entry = {}
-                                entry["title"] = item.get("title", "")
-                                entry["link"] = item.get("link", "")
-                                feed_data["entries"].append(entry)
-                        return feed_data
-                except Exception as e:
-                    print(f"Fallback method failed: {e}")
-
-                # All methods failed
-                print("All methods failed")
-                return {"entries": []}
-
-            def get_blog_info(feed_url, num_entries=None):
-                """
-                Fetches blog titles and links from an RSS feed.
-                """
-                feed = fetch_feed_with_proxy(feed_url)
-
-                entries = []
-                if not feed or "entries" not in feed:
-                    print("No entries found in feed")
+                    response = session.get(feed_url, timeout=20)
+                    raw_feed = response.content
+                    feed = feedparser.parse(raw_feed)
+                    entries = []
+                    for entry in feed.entries[:num_entries]:
+                        entries.append({
+                            "title": entry.title,
+                            "link": entry.link
+                        })
                     return entries
-
-                # Get all entries or limit if specified
-                entries_to_process = feed["entries"] if num_entries is None else feed["entries"][:num_entries]
-
-                for entry in entries_to_process:
-                    title = entry.get("title", "")
-                    if hasattr(entry, "link"):
-                        link = entry.link
-                    else:
-                        link = entry.get("link", "")
-
-                    entry_data = {
-                        "title": title,
-                        "link": link
-                    }
-                    entries.append(entry_data)
-
-                return entries
-
+                except Exception as e:
+                    print(f"Error using proxy {proxy.get('http')}: {e}")
+                    return []
+
             def update_markdown_file(filename, blog_info, start_marker, end_marker):
-                """
-                Updates a markdown file with blog info between specified markers.
-                """
-                # Create directory if it doesn't exist
-                os.makedirs(os.path.dirname(filename), exist_ok=True)
-
-                # Create file if it doesn't exist
                 if not os.path.exists(filename):
-                    print(f"Creating file {filename} as it doesn't exist")
-                    with open(filename, 'w', encoding='utf-8') as f:
-                        f.write("# Substack Blog Posts\n\n")
-                        f.write(f"{start_marker}\n\n{end_marker}\n")
-
-                # Read existing content
+                    print(f"File {filename} does not exist.")
+                    return
                 with open(filename, 'r', encoding='utf-8') as f:
                     file_content = f.read()
-
-                # Find markers
-                start_index = file_content.find(start_marker)
-                if start_index == -1:
-                    print(f"Start marker '{start_marker}' not found, adding it")
-                    file_content += f"\n\n{start_marker}\n\n{end_marker}\n"
-                    with open(filename, 'w', encoding='utf-8') as f:
-                        f.write(file_content)
-
-                    # Read updated content
-                    with open(filename, 'r', encoding='utf-8') as f:
-                        file_content = f.read()
-                    start_index = file_content.find(start_marker)
-
-                start_index += len(start_marker)
-                end_index = file_content.find(end_marker, start_index)
-
-                if end_index == -1:
-                    print(f"End marker '{end_marker}' not found, adding it")
-                    file_content = file_content[:start_index] + f"\n\n{end_marker}\n"
-                    with open(filename, 'w', encoding='utf-8') as f:
-                        f.write(file_content)
-
-                    # Read updated content
-                    with open(filename, 'r', encoding='utf-8') as f:
-                        file_content = f.read()
-                    end_index = file_content.find(end_marker, start_index)
-
-                # Generate new content
-                new_content = "\n"
+                start_index = file_content.find(start_marker) + len(start_marker)
+                end_index = file_content.find(end_marker)
+                new_content = ""
                 for entry in blog_info:
                     new_content += f"* [{entry['title']}]({entry['link']})\n"
-
-                # Add timestamp to force Git to detect changes
-                new_content += f"\n<!-- Updated: {time.strftime('%Y-%m-%d %H:%M:%S')} -->\n"
-
-                # Update content
-                updated_content = file_content[:start_index] + new_content + file_content[end_index:]
-
-                # Write updated content
+                updated_content = file_content[:start_index] + "\n" + new_content + file_content[end_index:]
                 with open(filename, 'w', encoding='utf-8') as f:
                     f.write(updated_content)
-
-                print(f"Updated {filename} with {len(blog_info)} blog posts!")
-                return True
-
-            # Main execution
-            print("Starting Substack blog post fetcher...")
-
-            # Configuration
-            FEED_URL = "https://datacommons.substack.com/feed"
-            REPO_FILE_PATH = "docs/substack_blogs.md"
-            START_MARKER = "<!-- START_MARKER -->"
-            END_MARKER = "<!-- END_MARKER -->"
-
-            # Get blog info
-            blog_info = get_blog_info(FEED_URL)
-
-            # Print the blog posts for the log
+                print(f"Updated {filename} successfully!")
+
+            def fetch_with_retry(feed_url, max_retries=5):
+                retries = 0
+                while retries < max_retries:
+                    blog_info = get_blog_info(feed_url)
+                    if blog_info:
+                        return blog_info
+                    print(f"Retrying {retries + 1}/{max_retries}...")
+                    retries += 1
+                    time.sleep(2 ** retries)
+                return []
+
+            feed_url = "https://datacommons.substack.com/feed"
+            blog_info = fetch_with_retry(feed_url)
+
+            filename = "docs/substack_blogs.md"
             if blog_info:
-                print(f"\n========= FOUND {len(blog_info)} SUBSTACK BLOG POSTS =========\n")
-
-                for i, entry in enumerate(blog_info):
-                    print(f"{i+1}. {entry.get('title', 'No title')}")
-                    print(f" URL: {entry.get('link', 'No link')}")
-                    print("")
-
-                # Update the markdown file
-                success = update_markdown_file(REPO_FILE_PATH, blog_info, START_MARKER, END_MARKER)
-
-                if success:
-                    print(f"{REPO_FILE_PATH} updated successfully!")
+                start_marker = "<!-- START_MARKER -->"
+                end_marker = "<!-- END_MARKER -->"
+                update_markdown_file(filename, blog_info, start_marker, end_marker)
             else:
-                print("No blog posts found or failed to fetch the feed.")
+                print("Failed to fetch blog info after all retries.")
 
       - name: Debug file
         run: |
@@ -268,12 +135,14 @@ jobs:
           git config --local user.email "action@github.com"
           git config --local user.name "GitHub Action"
          git add docs/substack_blogs.md
-          # Force a commit even if there are no changes
-          git diff --staged --quiet || git commit -m "Update Substack blog links [$(date +%s)]"
-          # Even if no changes, add a timestamp and commit
-          if git diff --staged --quiet; then
+
+          # Check if any staged changes exist
+          if ! git diff --staged --quiet; then
+            git commit -m "Update Substack blog links [$(date +%s)]"
+          else
             echo "<!-- Updated: $(date) -->" >> docs/substack_blogs.md
             git add docs/substack_blogs.md
             git commit -m "Force update timestamp [$(date +%s)]"
           fi
+
           git push origin master
