Skip to content

Commit fb021c0

Browse files
committed
increased robustness
1 parent 1f9f5c5 commit fb021c0

File tree

2 files changed

+49
-3
lines changed

2 files changed

+49
-3
lines changed

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = fastfeedparser
3-
version = 0.4.1
3+
version = 0.4.2
44
author = Vladimir Prelovac
55
author_email = [email protected]
66
description = High performance RSS, Atom and RDF parser in Python

src/fastfeedparser/main.py

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,17 @@ def parse(source: str | bytes) -> FastFeedParserDict:
198198
channel = child
199199
break
200200
if channel is None:
201-
raise ValueError("Invalid RSS feed: missing channel element")
201+
# Fallback: Check if this is a malformed RSS with Atom-style elements
202+
# This handles feeds like seancdavis.com that declare RSS but use Atom structure
203+
has_atom_elements = any(
204+
child.tag in ['entry', 'title', 'subtitle', 'updated', 'id', 'author', 'link']
205+
for child in root
206+
)
207+
if has_atom_elements:
208+
# Treat the RSS root as the channel for malformed feeds
209+
channel = root
210+
else:
211+
raise ValueError("Invalid RSS feed: missing channel element")
202212
# Find items with or without namespace
203213
items = channel.findall("item")
204214
if not items:
@@ -214,6 +224,20 @@ def parse(source: str | bytes) -> FastFeedParserDict:
214224
# Try recursive search for deeply nested items (minified feeds)
215225
if not items:
216226
items = channel.xpath(".//item") or channel.xpath(".//*[local-name()='item']")
227+
228+
# Fallback for malformed RSS: look for Atom-style <entry> elements
229+
if not items:
230+
items = channel.findall("entry")
231+
if not items:
232+
# Try to find entries with any namespace
233+
for child in channel:
234+
if child.tag.endswith("}entry") or child.tag == "entry":
235+
if not items:
236+
items = []
237+
items.append(child)
238+
# If still no entries were found, collect direct children whose tag is "entry" in any namespace
239+
if not items:
240+
items = [child for child in channel if child.tag.endswith("}entry") or child.tag == "entry"]
217241
elif root.tag.endswith("}feed"):
218242
# Detect Atom namespace dynamically
219243
if "{http://www.w3.org/2005/Atom}" in root.tag:
@@ -834,7 +858,18 @@ def wrapper(
834858
else:
835859
result = _get_element_value(root, atom_css) or _get_element_value(root, rdf_css)
836860

837-
return result
861+
if result:
862+
return result
863+
864+
# Try unnamespaced Atom fields for malformed RSS feeds like seancdavis.com
865+
# Extract the local name from the namespaced atom_css
866+
if atom_css.startswith("{") and "}" in atom_css:
867+
unnamespaced_atom = atom_css.split("}", 1)[1]
868+
result = _get_element_value(root, unnamespaced_atom)
869+
if result:
870+
return result
871+
872+
return None
838873

839874
elif feed_type == "atom":
840875

@@ -920,6 +955,17 @@ def _parse_date(date_str: str) -> Optional[str]:
920955
if not date_str:
921956
return None
922957

958+
# Fix invalid leap year dates (Feb 29 in non-leap years)
959+
# This handles feeds with incorrect dates like "2023-02-29"
960+
import re
961+
if re.match(r'(\d{4})-02-29', date_str):
962+
year_match = re.match(r'(\d{4})-02-29', date_str)
963+
if year_match:
964+
year = int(year_match.group(1))
965+
if not ((year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)):
966+
# Not a leap year, change Feb 29 to Feb 28
967+
date_str = date_str.replace(f'{year}-02-29', f'{year}-02-28')
968+
923969
# Try dateutil.parser first
924970
try:
925971
dt = dateutil_parser.parse(date_str, tzinfos=custom_tzinfos, ignoretz=False)

0 commit comments

Comments
 (0)