Jofthomas committed on
Commit
ab03104
·
1 Parent(s): a6c6ed7
Files changed (1) hide show
  1. app.py +56 -34
app.py CHANGED
@@ -1,6 +1,8 @@
1
  from __future__ import annotations
2
 
3
  import os
 
 
4
  from typing import List, Optional, Literal
5
 
6
  import httpx
@@ -58,38 +60,53 @@ def _fetch_subreddit_new(subreddit: str, limit: int) -> list[dict]:
58
  children = payload.get("data", {}).get("children", [])
59
  print(f"Reddit fetch source: JSON API ({len(children)} items)")
60
  return [child.get("data", {}) for child in children]
61
- except Exception:
62
- # RSS fallback
63
- feed_url = f"https://www.reddit.com/r/{subreddit}/new/.rss"
64
- feed = feedparser.parse(feed_url)
65
- posts: list[dict] = []
66
- for entry in feed.entries[:limit]:
67
- # Attempt to extract id and score if present (RSS is limited)
68
- link = entry.get("link") or ""
69
- title = entry.get("title") or ""
70
- # created: use published_parsed if available
71
- created_utc = 0.0
72
- if getattr(entry, "published_parsed", None):
73
- try:
74
- import calendar
75
-
76
- created_utc = float(calendar.timegm(entry.published_parsed))
77
- except Exception:
78
- created_utc = 0.0
79
- posts.append(
80
- {
81
- "title": title,
82
- "selftext": "",
83
- "score": 0,
84
- "created_utc": created_utc,
85
- "id": entry.get("id") or "",
86
- "permalink": "",
87
- "url": link,
88
- "link_flair_text": None,
89
- }
90
- )
91
- print(f"Reddit fetch source: RSS fallback ({len(posts)} items)")
92
- return posts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
 
95
  def _get_mistral_client() -> Mistral:
@@ -171,9 +188,12 @@ def scan_mistralai_pain_points(limit: int = 50, min_score: int = 0) -> List[Pain
171
  for post in raw_posts:
172
  title = post.get("title", "").strip()
173
  selftext = post.get("selftext", "") or ""
174
- score = int(post.get("score", 0) or 0)
 
175
 
176
- if score < min_score:
 
 
177
  continue
178
 
179
  try:
@@ -184,6 +204,7 @@ def scan_mistralai_pain_points(limit: int = 50, min_score: int = 0) -> List[Pain
184
  continue
185
 
186
  if not should:
 
187
  continue
188
 
189
  try:
@@ -210,6 +231,7 @@ def scan_mistralai_pain_points(limit: int = 50, min_score: int = 0) -> List[Pain
210
  flair=post.get("link_flair_text"),
211
  )
212
  )
 
213
 
214
  print(f"Extraction complete: {len(pain_points)} pain points")
215
  return pain_points
 
1
  from __future__ import annotations
2
 
3
  import os
4
+ import re
5
+ import html
6
  from typing import List, Optional, Literal
7
 
8
  import httpx
 
60
  children = payload.get("data", {}).get("children", [])
61
  print(f"Reddit fetch source: JSON API ({len(children)} items)")
62
  return [child.get("data", {}) for child in children]
63
+ except Exception as e:
64
+ print(f"Reddit JSON fetch failed: {e}; trying api.reddit.com")
65
+ try:
66
+ api_url = f"https://api.reddit.com/r/{subreddit}/new?limit={limit}"
67
+ with httpx.Client(timeout=httpx.Timeout(15.0), headers=headers) as client:
68
+ response = client.get(api_url, follow_redirects=True)
69
+ response.raise_for_status()
70
+ payload = response.json()
71
+ children = payload.get("data", {}).get("children", [])
72
+ print(f"Reddit fetch source: API domain ({len(children)} items)")
73
+ return [child.get("data", {}) for child in children]
74
+ except Exception as e2:
75
+ # RSS fallback
76
+ print(f"Reddit API fetch failed: {e2}; switching to RSS fallback")
77
+ feed_url = f"https://www.reddit.com/r/{subreddit}/new/.rss"
78
+ feed = feedparser.parse(feed_url)
79
+ posts: list[dict] = []
80
+ for entry in feed.entries[:limit]:
81
+ link = entry.get("link") or ""
82
+ title = entry.get("title") or ""
83
+ created_utc = 0.0
84
+ if getattr(entry, "published_parsed", None):
85
+ try:
86
+ import calendar
87
+ created_utc = float(calendar.timegm(entry.published_parsed))
88
+ except Exception:
89
+ created_utc = 0.0
90
+ # Extract a crude text body from RSS summary/content for better AI signal
91
+ raw_summary = getattr(entry, "summary", "") or getattr(entry, "description", "") or ""
92
+ if raw_summary:
93
+ text = html.unescape(re.sub(r"<[^>]+>", " ", raw_summary)).strip()
94
+ else:
95
+ text = ""
96
+ posts.append(
97
+ {
98
+ "title": title,
99
+ "selftext": text,
100
+ "score": None,
101
+ "created_utc": created_utc,
102
+ "id": entry.get("id") or "",
103
+ "permalink": "",
104
+ "url": link,
105
+ "link_flair_text": None,
106
+ }
107
+ )
108
+ print(f"Reddit fetch source: RSS fallback ({len(posts)} items)")
109
+ return posts
110
 
111
 
112
  def _get_mistral_client() -> Mistral:
 
188
  for post in raw_posts:
189
  title = post.get("title", "").strip()
190
  selftext = post.get("selftext", "") or ""
191
+ raw_score = post.get("score")
192
+ score = int(raw_score) if raw_score is not None else 0
193
 
194
+ # Only filter by score when a real score is available
195
+ if raw_score is not None and score < min_score:
196
+ print(f"Skip by score: '{title[:80]}' score={score} < min_score={min_score}")
197
  continue
198
 
199
  try:
 
204
  continue
205
 
206
  if not should:
207
+ print(f"Classifier NO: '{title[:80]}'")
208
  continue
209
 
210
  try:
 
231
  flair=post.get("link_flair_text"),
232
  )
233
  )
234
+ print(f"Added: '{ai_title[:80]}'")
235
 
236
  print(f"Extraction complete: {len(pain_points)} pain points")
237
  return pain_points