Skip to content

Commit

Permalink
Make Parser thread-safe (use new goxpp's xml:base state) (#202)
Browse files Browse the repository at this point in the history
* Add a test to detect race conditions with Parser

* Make Parser thread-safe (use new goxpp's xml:base state)

This changeset implements xml:base resolution based on state maintained
by the goxpp parser instead of using a shared urlStack. This allows
Parser to be safely used concurrently.

Depends on: mmcdole/goxpp#9

To test:
`go test -race`

* Depend on goxpp v1.1.0

Required for new thread-safe xml:base handling.

* Fix ResolveHTML

I had changed the name of UrlStack to BaseStack in the goxpp
implementation.
  • Loading branch information
cristoper committed Feb 28, 2023
1 parent 20c6436 commit 901f297
Show file tree
Hide file tree
Showing 8 changed files with 115 additions and 182 deletions.
40 changes: 14 additions & 26 deletions atom/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,35 +14,23 @@ import (
var (
// Atom elements which contain URIs
// https://tools.ietf.org/html/rfc4287
uriElements = map[string]bool{
atomUriElements = map[string]bool{
"icon": true,
"id": true,
"logo": true,
"uri": true,
"url": true, // atom 0.3
}

// Atom attributes which contain URIs
// https://tools.ietf.org/html/rfc4287
atomURIAttrs = map[string]bool{
"href": true,
"scheme": true,
"src": true,
"uri": true,
}
)

// Parser is an Atom Parser
type Parser struct {
base *shared.XMLBase
}
type Parser struct{}

// Parse parses an xml feed into an atom.Feed
func (ap *Parser) Parse(feed io.Reader) (*Feed, error) {
p := xpp.NewXMLPullParser(feed, false, shared.NewReaderLabel)
ap.base = &shared.XMLBase{URIAttrs: atomURIAttrs}

_, err := ap.base.FindRoot(p)
_, err := shared.FindRoot(p)
if err != nil {
return nil, err
}
Expand All @@ -67,7 +55,7 @@ func (ap *Parser) parseRoot(p *xpp.XMLPullParser) (*Feed, error) {
extensions := ext.Extensions{}

for {
tok, err := ap.base.NextTag(p)
tok, err := shared.NextTag(p)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -221,7 +209,7 @@ func (ap *Parser) parseEntry(p *xpp.XMLPullParser) (*Entry, error) {
extensions := ext.Extensions{}

for {
tok, err := ap.base.NextTag(p)
tok, err := shared.NextTag(p)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -376,7 +364,7 @@ func (ap *Parser) parseSource(p *xpp.XMLPullParser) (*Source, error) {
extensions := ext.Extensions{}

for {
tok, err := ap.base.NextTag(p)
tok, err := shared.NextTag(p)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -534,7 +522,7 @@ func (ap *Parser) parsePerson(name string, p *xpp.XMLPullParser) (*Person, error
person := &Person{}

for {
tok, err := ap.base.NextTag(p)
tok, err := shared.NextTag(p)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -684,7 +672,7 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
if strings.Contains(result, "<![CDATA[") {
result = shared.StripCDATA(result)
if lowerType == "html" || strings.Contains(lowerType, "xhtml") {
result, _ = ap.base.ResolveHTML(result)
result, _ = shared.ResolveHTML(p, result)
}
} else {
// decode non-CDATA contents depending on type
Expand All @@ -695,12 +683,12 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
result, err = shared.DecodeEntities(result)
} else if strings.Contains(lowerType, "xhtml") {
result = ap.stripWrappingDiv(result)
result, _ = ap.base.ResolveHTML(result)
result, _ = shared.ResolveHTML(p, result)
} else if lowerType == "html" {
result = ap.stripWrappingDiv(result)
result, err = shared.DecodeEntities(result)
if err == nil {
result, _ = ap.base.ResolveHTML(result)
result, _ = shared.ResolveHTML(p, result)
}
} else {
decodedStr, err := base64.StdEncoding.DecodeString(result)
Expand All @@ -712,10 +700,10 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {

// resolve relative URIs in URI-containing elements according to xml:base
name := strings.ToLower(p.Name)
if uriElements[name] {
resolved, err := ap.base.ResolveURL(result)
if err == nil {
result = resolved
if atomUriElements[name] {
resolved, err := p.XmlBaseResolveUrl(result)
if resolved != nil && err == nil {
result = resolved.String()
}
}

Expand Down
3 changes: 1 addition & 2 deletions detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,7 @@ func DetectFeedType(feed io.Reader) FeedType {
// Check if it's an XML based feed
p := xpp.NewXMLPullParser(bytes.NewReader(buffer.Bytes()), false, shared.NewReaderLabel)

xmlBase := shared.XMLBase{}
_, err := xmlBase.FindRoot(p)
_, err := shared.FindRoot(p)
if err != nil {
return FeedTypeUnknown
}
Expand Down
5 changes: 3 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ go 1.19
require (
github.com/PuerkitoBio/goquery v1.8.0
github.com/json-iterator/go v1.1.12
github.com/mmcdole/goxpp v0.0.0-20200921145534-2f3784f67354
github.com/stretchr/testify v1.3.0
github.com/mmcdole/goxpp v1.1.0
github.com/stretchr/testify v1.8.1
github.com/urfave/cli v1.22.3
golang.org/x/net v0.4.0
golang.org/x/text v0.5.0
Expand All @@ -21,4 +21,5 @@ require (
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/russross/blackfriday/v2 v2.0.1 // indirect
github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
15 changes: 12 additions & 3 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/mmcdole/goxpp v0.0.0-20200921145534-2f3784f67354 h1:Z6i7ND25ixRtXFBylIUggqpvLMV1I15yprcqMVB7WZA=
github.com/mmcdole/goxpp v0.0.0-20200921145534-2f3784f67354/go.mod h1:pasqhqstspkosTneA62Nc+2p9SOBBYAPbnmRRWPQ0V8=
github.com/mmcdole/goxpp v1.1.0 h1:WwslZNF7KNAXTFuzRtn/OKZxFLJAAyOA9w82mDz2ZGI=
github.com/mmcdole/goxpp v1.1.0/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
Expand All @@ -25,8 +25,13 @@ github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQD
github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/urfave/cli v1.22.3 h1:FpNT6zq26xNpHZy08emi755QwzLPs6Pukqjlc7RfOMU=
github.com/urfave/cli v1.22.3/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
Expand All @@ -39,5 +44,9 @@ golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.5.0 h1:OLmvp0KP+FVG99Ct/qFiL/Fhk4zp4QQnZ7b2U+5piUM=
golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
21 changes: 21 additions & 0 deletions internal/shared/parseutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package shared
import (
"bytes"
"errors"
"fmt"
"html"
"regexp"
"strings"
Expand All @@ -23,6 +24,26 @@ var (
const CDATA_START = "<![CDATA["
const CDATA_END = "]]>"

// FindRoot advances the pull parser one token at a time until the
// first StartTag event is seen, and returns that event with a nil error.
//
// If p.Next reports an error, the event and error it produced are
// returned unchanged. If EndDocument is reached before any StartTag,
// the EndDocument event is returned together with an error.
func FindRoot(p *xpp.XMLPullParser) (event xpp.XMLEventType, err error) {
	for {
		if event, err = p.Next(); err != nil {
			return event, err
		}

		switch event {
		case xpp.StartTag:
			// Root element located; err is nil at this point.
			return event, nil
		case xpp.EndDocument:
			return event, fmt.Errorf("Failed to find root node before document end.")
		}
		// Any other event precedes the root element: keep scanning.
	}
}

// ParseText is a helper function for parsing the text
// from the current element of the XMLPullParser.
// This function can handle parsing naked XML text from
Expand Down

0 comments on commit 901f297

Please sign in to comment.