diff --git a/atom/parser.go b/atom/parser.go
index 48e81823..4ab3e4e2 100644
--- a/atom/parser.go
+++ b/atom/parser.go
@@ -21,6 +21,9 @@ var (
 		"uri":  true,
 		"url":  true, // atom 0.3
 	}
+
+	// Atom has no dedicated extension parsers yet, so this map stays empty.
+	emptyExtParsers = make(shared.ExtParsers)
 )
 
 // Parser is an Atom Parser
@@ -38,6 +41,18 @@ func (ap *Parser) Parse(feed io.Reader) (*Feed, error) {
 	return ap.parseRoot(p)
 }
 
+// ParseAsExtension parses the element the pull parser is positioned on as
+// a child element of an Atom entry and returns the populated *Entry.
+// Elements that are not known entry children are skipped, leaving the
+// returned entry empty. It satisfies shared.ExtParser.
+func (ap *Parser) ParseAsExtension(p *xpp.XMLPullParser) (interface{}, error) {
+	entry := &Entry{}
+	if err := ap.parseEntryContent(p, entry); err != nil {
+		return nil, err
+	}
+	return entry, nil
+}
+
 func (ap *Parser) parseRoot(p *xpp.XMLPullParser) (*Feed, error) {
 	if err := p.Expect(xpp.StartTag, "feed"); err != nil {
 		return nil, err
@@ -68,8 +83,8 @@ func (ap *Parser) parseRoot(p *xpp.XMLPullParser) (*Feed, error) {
 
 			name := strings.ToLower(p.Name)
 
-			if shared.IsExtension(p) {
-				e, err := shared.ParseExtension(extensions, p)
+			if shared.IsExtension(p, emptyExtParsers) {
+				e, err := shared.ParseExtension(extensions, p, emptyExtParsers)
 				if err != nil {
 					return nil, err
 				}
@@ -215,103 +230,14 @@ func (ap *Parser) parseEntry(p *xpp.XMLPullParser) (*Entry, error) {
 		}
 
 		if tok == xpp.StartTag {
-
-			name := strings.ToLower(p.Name)
-
-			if shared.IsExtension(p) {
-				e, err := shared.ParseExtension(extensions, p)
+			if shared.IsExtension(p, emptyExtParsers) {
+				e, err := shared.ParseExtension(extensions, p, emptyExtParsers)
 				if err != nil {
 					return nil, err
 				}
 				extensions = e
-			} else if name == "title" {
-				result, err := ap.parseAtomText(p)
-				if err != nil {
-					return nil, err
-				}
-				entry.Title = result
-			} else if name == "id" {
-				result, err := ap.parseAtomText(p)
-				if err != nil {
-					return nil, err
-				}
-				entry.ID = result
-			} else if name == "rights" ||
-				name == "copyright" {
-				result, err := ap.parseAtomText(p)
-				if err != nil {
-					return nil, err
-				}
-				entry.Rights = result
-			} else if name == "summary" {
-				result, err := ap.parseAtomText(p)
-				if err != nil {
-					return nil, err
-				}
-				entry.Summary = result
-			} else if name == "source" {
-				result, err := ap.parseSource(p)
-				if err != nil {
-					return nil, err
-				}
-				entry.Source = result
-			} else if name == "updated" ||
-				name == "modified" {
-				result, err := ap.parseAtomText(p)
-				if err != nil {
-					return nil, err
-				}
-				entry.Updated = result
-				date, err := shared.ParseDate(result)
-				if err == nil {
-					utcDate := date.UTC()
-					entry.UpdatedParsed = &utcDate
-				}
-			} else if name == "contributor" {
-				result, err := ap.parsePerson("contributor", p)
-				if err != nil {
-					return nil, err
-				}
-				entry.Contributors = append(entry.Contributors, result)
-			} else if name == "author" {
-				result, err := ap.parsePerson("author", p)
-				if err != nil {
-					return nil, err
-				}
-				entry.Authors = append(entry.Authors, result)
-			} else if name == "category" {
-				result, err := ap.parseCategory(p)
-				if err != nil {
-					return nil, err
-				}
-				entry.Categories = append(entry.Categories, result)
-			} else if name == "link" {
-				result, err := ap.parseLink(p)
-				if err != nil {
-					return nil, err
-				}
-				entry.Links = append(entry.Links, result)
-			} else if name == "published" ||
-				name == "issued" {
-				result, err := ap.parseAtomText(p)
-				if err != nil {
-					return nil, err
-				}
-				entry.Published = result
-				date, err := shared.ParseDate(result)
-				if err == nil {
-					utcDate := date.UTC()
-					entry.PublishedParsed = &utcDate
-				}
-			} else if name == "content" {
-				result, err := ap.parseContent(p)
-				if err != nil {
-					return nil, err
-				}
-				entry.Content = result
 			} else {
-				err := p.Skip()
-				if err != nil {
+				if err := ap.parseEntryContent(p, entry); err != nil {
 					return nil, err
 				}
 			}
@@ -329,6 +255,102 @@ func (ap *Parser) parseEntry(p *xpp.XMLPullParser) (*Entry, error) {
 
 	return entry, nil
 }
 
+// parseEntryContent parses the element the pull parser is positioned on
+// as a single child of an Atom entry and stores the result on entry.
+// Element names are matched case-insensitively; unmatched ones are skipped.
+func (ap *Parser) parseEntryContent(p *xpp.XMLPullParser, entry *Entry) error {
+	switch strings.ToLower(p.Name) {
+	case "title":
+		result, err := ap.parseAtomText(p)
+		if err != nil {
+			return err
+		}
+		entry.Title = result
+	case "id":
+		result, err := ap.parseAtomText(p)
+		if err != nil {
+			return err
+		}
+		entry.ID = result
+	case "rights", "copyright":
+		result, err := ap.parseAtomText(p)
+		if err != nil {
+			return err
+		}
+		entry.Rights = result
+	case "summary":
+		result, err := ap.parseAtomText(p)
+		if err != nil {
+			return err
+		}
+		entry.Summary = result
+	case "source":
+		result, err := ap.parseSource(p)
+		if err != nil {
+			return err
+		}
+		entry.Source = result
+	case "updated", "modified":
+		result, err := ap.parseAtomText(p)
+		if err != nil {
+			return err
+		}
+		entry.Updated = result
+		// An unparseable date is not fatal; the raw string is still kept.
+		if date, err := shared.ParseDate(result); err == nil {
+			utcDate := date.UTC()
+			entry.UpdatedParsed = &utcDate
+		}
+	case "contributor":
+		result, err := ap.parsePerson("contributor", p)
+		if err != nil {
+			return err
+		}
+		entry.Contributors = append(entry.Contributors, result)
+	case "author":
+		result, err := ap.parsePerson("author", p)
+		if err != nil {
+			return err
+		}
+		entry.Authors = append(entry.Authors, result)
+	case "category":
+		result, err := ap.parseCategory(p)
+		if err != nil {
+			return err
+		}
+		entry.Categories = append(entry.Categories, result)
+	case "link":
+		result, err := ap.parseLink(p)
+		if err != nil {
+			return err
+		}
+		entry.Links = append(entry.Links, result)
+	case "published", "issued":
+		result, err := ap.parseAtomText(p)
+		if err != nil {
+			return err
+		}
+		entry.Published = result
+		// An unparseable date is not fatal; the raw string is still kept.
+		if date, err := shared.ParseDate(result); err == nil {
+			utcDate := date.UTC()
+			entry.PublishedParsed = &utcDate
+		}
+	case "content":
+		result, err := ap.parseContent(p)
+		if err != nil {
+			return err
+		}
+		entry.Content = result
+	default:
+		// Not an entry element handled above: skip over it.
+		if err := p.Skip(); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
 func (ap *Parser) parseSource(p *xpp.XMLPullParser) (*Source, error) {
 	if err := p.Expect(xpp.StartTag, "source"); err != nil {
@@ -357,8 +379,8 @@ func (ap *Parser) parseSource(p *xpp.XMLPullParser) (*Source, error) {
 
 			name := strings.ToLower(p.Name)
 
-			if shared.IsExtension(p) {
-				e, err := shared.ParseExtension(extensions, p)
+			if shared.IsExtension(p, emptyExtParsers) {
+				e, err := shared.ParseExtension(extensions, p, emptyExtParsers)
 				if err != nil {
 					return nil, err
 				}
diff --git a/extensions/extensions.go b/extensions/extensions.go
index 6c50d4aa..53021db8 100644
--- a/extensions/extensions.go
+++ b/extensions/extensions.go
@@ -12,6 +12,8 @@ type Extension struct {
 	Value    string                 `json:"value"`
 	Attrs    map[string]string      `json:"attrs"`
 	Children map[string][]Extension `json:"children"`
+	// Parsed holds the typed result when a dedicated ExtParser handled the element.
+	Parsed   interface{}            `json:"parsed,omitempty"`
 }
 
 func parseTextExtension(name string, extensions map[string][]Extension) (value string) {
diff --git a/internal/shared/extparser.go b/internal/shared/extparser.go
index 7139ac85..2b0e8cab 100644
--- a/internal/shared/extparser.go
+++ b/internal/shared/extparser.go
@@ -16,10 +16,20 @@ var knownPrefixes = []string{
 	"atom03",
 }
 
+// ExtParser parses the element the pull parser is positioned on and
+// returns a typed representation of it.
+type ExtParser interface {
+	ParseAsExtension(p *xpp.XMLPullParser) (interface{}, error)
+}
+
+// ExtParsers maps a namespace prefix to the ExtParser that handles
+// elements carrying that prefix.
+type ExtParsers map[string]ExtParser
+
 // IsExtension returns whether or not the current
 // XML element is an extension element (if it has a
 // non empty prefix)
-func IsExtension(p *xpp.XMLPullParser) bool {
+func IsExtension(p *xpp.XMLPullParser, extParsers ExtParsers) bool {
 	space := strings.TrimSpace(p.Space)
 	prefix := PrefixForNamespace(space, p)
 
@@ -27,16 +37,27 @@ func IsExtension(p *xpp.XMLPullParser) bool {
 		return false
 	}
 
+	// A dedicated parser is registered for this prefix, so treat the element as an extension even when the prefix is a known one.
+	if _, ok := extParsers[prefix]; ok {
+		return true
+	}
+
 	return !slices.Contains(knownPrefixes, prefix)
 }
 
 // ParseExtension parses the current element of the
 // XMLPullParser as an extension element and updates
 // the extension map
-func ParseExtension(fe ext.Extensions, p *xpp.XMLPullParser) (ext.Extensions, error) {
+func ParseExtension(fe ext.Extensions, p *xpp.XMLPullParser, extParsers ExtParsers) (ext.Extensions, error) {
 	prefix := PrefixForNamespace(p.Space, p)
 
-	result, err := parseExtensionElement(p)
+	var result ext.Extension
+	var err error
+	if extParser, ok := extParsers[prefix]; ok {
+		result, err = parseExtensionFromParser(p, extParser)
+	} else {
+		result, err = parseExtensionElement(p)
+	}
 	if err != nil {
 		return nil, err
 	}
@@ -54,6 +75,26 @@ func ParseExtension(fe ext.Extensions, p *xpp.XMLPullParser) (ext.Extensions, er
 	return fe, nil
 }
 
+// parseExtensionFromParser parses the current element with extParser and
+// stores the typed result in the returned extension's Parsed field. Only
+// Name and Parsed are set; Value, Attrs and Children are left empty.
+func parseExtensionFromParser(p *xpp.XMLPullParser, extParser ExtParser) (e ext.Extension, err error) {
+	if err = p.Expect(xpp.StartTag, "*"); err != nil {
+		return e, err
+	}
+
+	e.Name = p.Name
+	if e.Parsed, err = extParser.ParseAsExtension(p); err != nil {
+		return e, err
+	}
+
+	if err = p.Expect(xpp.EndTag, e.Name); err != nil {
+		return e, err
+	}
+
+	return e, nil
+}
+
 func parseExtensionElement(p *xpp.XMLPullParser) (e ext.Extension, err error) {
 	if err = p.Expect(xpp.StartTag, "*"); err != nil {
 		return e, err
diff --git a/parser.go b/parser.go
index 74d29ca0..2d498bde 100644
--- a/parser.go
+++ b/parser.go
@@ -11,6 +11,7 @@ import (
 
 	"github.com/mmcdole/gofeed/atom"
+	"github.com/mmcdole/gofeed/internal/shared"
 	"github.com/mmcdole/gofeed/json"
 	"github.com/mmcdole/gofeed/rss"
 )
 
@@ -155,8 +156,21 @@ func (f *Parser) parseAtomFeed(feed io.Reader) (*Feed, error) {
 	return f.atomTrans().Translate(af)
 }
 
+// BuildRSSExtParsers returns the extension parsers handed to the RSS
+// parser: the atom, atom10 and atom03 prefixes all map to f.ap.
+func (f *Parser) BuildRSSExtParsers() shared.ExtParsers {
+	extParsers := make(shared.ExtParsers, 3)
+
+	// all possible atom variants
+	extParsers["atom"] = f.ap
+	extParsers["atom10"] = f.ap
+	extParsers["atom03"] = f.ap
+
+	return extParsers
+}
+
 func (f *Parser) parseRSSFeed(feed io.Reader) (*Feed, error) {
-	rf, err := f.rp.Parse(feed)
+	rf, err := f.rp.Parse(feed, f.BuildRSSExtParsers())
 	if err != nil {
 		return nil, err
 	}
diff --git a/rss/parser.go b/rss/parser.go
index 575896e9..51b79582 100644
--- a/rss/parser.go
+++ b/rss/parser.go
@@ -11,11 +11,17 @@ import (
 )
 
 // Parser is a RSS Parser
-type Parser struct{}
+type Parser struct {
+	extParsers shared.ExtParsers
+}
 
-// Parse parses an xml feed into an rss.Feed
-func (rp *Parser) Parse(feed io.Reader) (*Feed, error) {
+// Parse parses an xml feed into an rss.Feed. Elements whose namespace
+// prefix has an entry in extParsers are parsed by that ExtParser; a nil
+// map is accepted and matches nothing. extParsers is stored on the
+// receiver, so a Parser must not be shared by concurrent Parse calls.
+func (rp *Parser) Parse(feed io.Reader, extParsers shared.ExtParsers) (*Feed, error) {
 	p := xpp.NewXMLPullParser(feed, false, shared.NewReaderLabel)
+	rp.extParsers = extParsers
 
 	_, err := shared.FindRoot(p)
 	if err != nil {
@@ -53,7 +59,7 @@ func (rp *Parser) parseRoot(p *xpp.XMLPullParser) (*Feed, error) {
 		if tok == xpp.StartTag {
 
 			// Skip any extensions found in the feed root.
-			if shared.IsExtension(p) {
+			if shared.IsExtension(p, rp.extParsers) {
 				p.Skip()
 				continue
 			}
@@ -140,8 +146,8 @@ func (rp *Parser) parseChannel(p *xpp.XMLPullParser) (rss *Feed, err error) {
 
 			name := strings.ToLower(p.Name)
 
-			if shared.IsExtension(p) {
-				ext, err := shared.ParseExtension(extensions, p)
+			if shared.IsExtension(p, rp.extParsers) {
+				ext, err := shared.ParseExtension(extensions, p, rp.extParsers)
 				if err != nil {
 					return nil, err
 				}
@@ -337,8 +343,8 @@ func (rp *Parser) parseItem(p *xpp.XMLPullParser) (item *Item, err error) {
 
 			name := strings.ToLower(p.Name)
 
-			if shared.IsExtension(p) {
-				ext, err := shared.ParseExtension(extensions, p)
+			if shared.IsExtension(p, rp.extParsers) {
+				ext, err := shared.ParseExtension(extensions, p, rp.extParsers)
 				if err != nil {
 					return nil, err
 				}
diff --git a/rss/parser_test.go b/rss/parser_test.go
index 51af5781..3953e7b7 100644
--- a/rss/parser_test.go
+++ b/rss/parser_test.go
@@ -9,6 +9,7 @@ import (
 	"strings"
 	"testing"
 
+	"github.com/mmcdole/gofeed"
 	"github.com/mmcdole/gofeed/rss"
 	"github.com/stretchr/testify/assert"
 )
@@ -27,7 +28,7 @@ func TestParser_Parse(t *testing.T) {
 
 		// Parse actual feed
 		fp := &rss.Parser{}
-		actual, _ := fp.Parse(bytes.NewReader(f))
+		actual, _ := fp.Parse(bytes.NewReader(f), gofeed.NewParser().BuildRSSExtParsers())
 
 		// Get json encoded expected feed result
 		ef := fmt.Sprintf("../testdata/parser/rss/%s.json", name)
diff --git a/testdata/parser/rss/rss_channel_item_author_atom.json b/testdata/parser/rss/rss_channel_item_author_atom.json
new file mode 100644
index 00000000..06fe10af
--- /dev/null
+++ b/testdata/parser/rss/rss_channel_item_author_atom.json
@@ -0,0 +1,26 @@
+{
+  "items": [
+    {
+      "extensions": {
+        "atom": {
+          "author": [
+            {
+              "name": "author",
+              "value": "",
+              "attrs": null,
+              "children": null,
+              "parsed": {
+                "authors": [
+                  {
+                    "name": "Item Author"
+                  }
+                ]
+              }
+            }
+          ]
+        }
+      }
+    }
+  ],
+  "version": "2.0"
+}
diff --git a/testdata/parser/rss/rss_channel_item_author_atom.xml b/testdata/parser/rss/rss_channel_item_author_atom.xml
new file mode 100644
index 00000000..f444b0cc
--- /dev/null
+++ b/testdata/parser/rss/rss_channel_item_author_atom.xml
@@ -0,0 +1,10 @@
+
+
+
+
+Item Author
+
+
+