diff options
author | Eugen Rochko <eugen@zeonfederated.com> | 2022-02-07 18:16:31 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-02-07 18:16:31 +0100 |
commit | f1f6ddd5362f40e287857750f5e102206bd0e169 (patch) | |
tree | 6080f31997793f1a3898eae32de3211d9cab41f2 | |
parent | 73a782391ca3bc5cbb24fb98065f6a5f4d64f22c (diff) |
Fix structured data parsing from links choking on bad data (#17403)
* Fix structured data parsing from links choking on bad data - Fix og:url meta tag being prioritized over canonical link tag - Fix structured data parsing choking on commented-out CDATA declarations - Fix HTML entities in title, description, provider_name, author_name - Change structured data parsing to attempt every JSON-LD script tag * Remove unnecessary slash escapes from CDATA regex pattern
-rw-r--r-- | app/lib/link_details_extractor.rb | 53 | ||||
-rw-r--r-- | spec/lib/link_details_extractor_spec.rb | 122 |
2 files changed, 166 insertions, 9 deletions
diff --git a/app/lib/link_details_extractor.rb b/app/lib/link_details_extractor.rb index 56ad0717b..d2bcf0c25 100644 --- a/app/lib/link_details_extractor.rb +++ b/app/lib/link_details_extractor.rb @@ -3,6 +3,19 @@ class LinkDetailsExtractor include ActionView::Helpers::TagHelper + # Some publications wrap their JSON-LD data in their <script> tags + # in commented-out CDATA blocks, they need to be removed before + # attempting to parse JSON + CDATA_JUNK_PATTERN = %r{^[\s]*( + (/\*[\s]*<!\[CDATA\[[\s]*\*/) # Block comment style opening + | + (//[\s]*<!\[CDATA\[) # Single-line comment style opening + | + (/\*[\s]*\]\]>[\s]*\*/) # Block comment style closing + | + (//[\s]*\]\]>) # Single-line comment style closing + )[\s]*$}x + class StructuredData SUPPORTED_TYPES = %w( NewsArticle @@ -61,6 +74,10 @@ class LinkDetailsExtractor publisher.dig('logo', 'url') end + def valid? + json.present? + end + private def author @@ -134,11 +151,11 @@ class LinkDetailsExtractor end def title - structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first + html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first) end def description - structured_data&.description || opengraph_tag('og:description') || meta_tag('description') + html_entities.decode(structured_data&.description || opengraph_tag('og:description') || meta_tag('description')) end def image @@ -146,11 +163,11 @@ class LinkDetailsExtractor end def canonical_url - valid_url_or_nil(opengraph_tag('og:url') || link_tag('canonical'), same_origin_only: true) || @original_url.to_s + valid_url_or_nil(link_tag('canonical') || opengraph_tag('og:url'), same_origin_only: true) || @original_url.to_s end def provider_name - structured_data&.publisher_name || opengraph_tag('og:site_name') + html_entities.decode(structured_data&.publisher_name || opengraph_tag('og:site_name')) end def provider_url @@ -158,7 +175,7 @@ class LinkDetailsExtractor end def author_name - structured_data&.author_name || opengraph_tag('og:author') || opengraph_tag('og:author:username') + html_entities.decode(structured_data&.author_name || opengraph_tag('og:author') || opengraph_tag('og:author:username')) end def author_url @@ -223,10 +240,24 @@ class LinkDetailsExtractor def structured_data @structured_data ||= begin - json_ld = document.xpath('//script[@type="application/ld+json"]').map(&:content).first - json_ld.present? ? StructuredData.new(json_ld) : nil - rescue Oj::ParseError - nil + # Some publications have more than one JSON-LD definition on the page, + # and some of those definitions aren't valid JSON either, so we have + # to loop through here until we find something that is the right type + # and doesn't break + document.xpath('//script[@type="application/ld+json"]').filter_map do |element| + json_ld = element.content&.gsub(CDATA_JUNK_PATTERN, '') + + next if json_ld.blank? + + structured_data = StructuredData.new(html_entities.decode(json_ld)) + + next unless structured_data.valid? + + structured_data + rescue Oj::ParseError, EncodingError + Rails.logger.debug("Invalid JSON-LD in #{@original_url}") + next + end.first end end @@ -246,4 +277,8 @@ class LinkDetailsExtractor detector.strip_tags = true end end + + def html_entities + @html_entities ||= HTMLEntities.new + end end diff --git a/spec/lib/link_details_extractor_spec.rb b/spec/lib/link_details_extractor_spec.rb index 850857b2d..84bb4579c 100644 --- a/spec/lib/link_details_extractor_spec.rb +++ b/spec/lib/link_details_extractor_spec.rb @@ -26,4 +26,126 @@ RSpec.describe LinkDetailsExtractor do end end end + + context 'when structured data is present' do + let(:original_url) { 'https://example.com/page.html' } + + context 'and is wrapped in CDATA tags' do + let(:html) { <<-HTML } +<!doctype html> +<html> +<head> + <script type="application/ld+json"> + //<![CDATA[ + {"@context":"http://schema.org","@type":"NewsArticle","mainEntityOfPage":"https://example.com/page.html","headline":"Foo","datePublished":"2022-01-31T19:53:00+00:00","url":"https://example.com/page.html","description":"Bar","author":{"@type":"Person","name":"Hoge"},"publisher":{"@type":"Organization","name":"Baz"}} + //]]> + </script> +</head> +</html> + HTML + + describe '#title' do + it 'returns the title from structured data' do + expect(subject.title).to eq 'Foo' + end + end + + describe '#description' do + it 'returns the description from structured data' do + expect(subject.description).to eq 'Bar' + end + end + + describe '#provider_name' do + it 'returns the provider name from structured data' do + expect(subject.provider_name).to eq 'Baz' + end + end + + describe '#author_name' do + it 'returns the author name from structured data' do + expect(subject.author_name).to eq 'Hoge' + end + end + end + + context 'but the first tag is invalid JSON' do + let(:html) { <<-HTML } +<!doctype html> +<html> +<body> + <script type="application/ld+json"> + { + "@context":"https://schema.org", + "@type":"ItemList", + "url":"https://example.com/page.html", + "name":"Foo", + "description":"Bar" + }, + { + "@context": "https://schema.org", + "@type": "BreadcrumbList", + "itemListElement":[ + { + "@type":"ListItem", + "position":1, + "item":{ + "@id":"https://www.example.com", + "name":"Baz" + } + } + ] + } + </script> + <script type="application/ld+json"> + { + "@context":"https://schema.org", + "@type":"NewsArticle", + "mainEntityOfPage": { + "@type":"WebPage", + "@id": "http://example.com/page.html" + }, + "headline": "Foo", + "description": "Bar", + "datePublished": "2022-01-31T19:46:00+00:00", + "author": { + "@type": "Organization", + "name": "Hoge" + }, + "publisher": { + "@type": "NewsMediaOrganization", + "name":"Baz", + "url":"https://example.com/" + } + } + </script> +</body> +</html> + HTML + + describe '#title' do + it 'returns the title from structured data' do + expect(subject.title).to eq 'Foo' + end + end + + describe '#description' do + it 'returns the description from structured data' do + expect(subject.description).to eq 'Bar' + end + end + + describe '#provider_name' do + it 'returns the provider name from structured data' do + expect(subject.provider_name).to eq 'Baz' + end + end + + describe '#author_name' do + it 'returns the author name from structured data' do + expect(subject.author_name).to eq 'Hoge' + end + end + end + end end |