From f1f6ddd5362f40e287857750f5e102206bd0e169 Mon Sep 17 00:00:00 2001
From: Eugen Rochko
Date: Mon, 7 Feb 2022 18:16:31 +0100
Subject: Fix structured data parsing from links choking on bad data (#17403)

* Fix structured data parsing from links choking on bad data

- Fix og:url meta tag being prioritized over canonical link tag
- Fix structured data parsing choking on commented-out CDATA declarations
- Fix HTML entities in title, description, provider_name, author_name
- Change structured data parsing to attempt every JSON-LD script tag

* Remove unnecessary slash escapes from CDATA regex pattern
---
# Review notes (free-form text placed before the diffstat; not part of the diff):
#
# The hunk below adds a `context 'when structured data is present'` group to
# the LinkDetailsExtractor spec. It sets `original_url` to
# 'https://example.com/page.html' and defines two nested contexts, each with
# its own `html` heredoc:
#   * 'and is wrapped in CDATA tags'
#   * 'but the first tag is invalid JSON'
# Both contexts make the same four assertions on `subject`:
#   title == 'Foo', description == 'Bar',
#   provider_name == 'Baz', author_name == 'Hoge'.
#
# NOTE(review): in this copy both `<<-HTML` heredoc bodies consist only of
# empty added lines, and the diffstat reports 122 insertions while fewer `+`
# lines are visible. Presumably the HTML/JSON-LD fixture markup was stripped
# when this page was extracted -- TODO recover the fixtures from the upstream
# commit before applying; with empty fixtures the examples have no structured
# data to read.
#
# NOTE(review): `subject` and the way `html` / `original_url` are consumed are
# defined outside this hunk -- confirm against the full spec file.
 spec/lib/link_details_extractor_spec.rb | 122 ++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)

(limited to 'spec/lib/link_details_extractor_spec.rb')

diff --git a/spec/lib/link_details_extractor_spec.rb b/spec/lib/link_details_extractor_spec.rb
index 850857b2d..84bb4579c 100644
--- a/spec/lib/link_details_extractor_spec.rb
+++ b/spec/lib/link_details_extractor_spec.rb
@@ -26,4 +26,126 @@ RSpec.describe LinkDetailsExtractor do
       end
     end
   end
+
+  context 'when structured data is present' do
+    let(:original_url) { 'https://example.com/page.html' }
+
+    context 'and is wrapped in CDATA tags' do
+      let(:html) { <<-HTML }
+
+
+
+
+
+
+      HTML
+
+      describe '#title' do
+        it 'returns the title from structured data' do
+          expect(subject.title).to eq 'Foo'
+        end
+      end
+
+      describe '#description' do
+        it 'returns the description from structured data' do
+          expect(subject.description).to eq 'Bar'
+        end
+      end
+
+      describe '#provider_name' do
+        it 'returns the provider name from structured data' do
+          expect(subject.provider_name).to eq 'Baz'
+        end
+      end
+
+      describe '#author_name' do
+        it 'returns the author name from structured data' do
+          expect(subject.author_name).to eq 'Hoge'
+        end
+      end
+    end
+
+    context 'but the first tag is invalid JSON' do
+      let(:html) { <<-HTML }
+
+
+
+
+
+
+
+      HTML
+
+      describe '#title' do
+        it 'returns the title from structured data' do
+          expect(subject.title).to eq 'Foo'
+        end
+      end
+
+      describe '#description' do
+        it 'returns the description from structured data' do
+          expect(subject.description).to eq 'Bar'
+        end
+      end
+
+      describe '#provider_name' do
+        it 'returns the provider name from structured data' do
+          expect(subject.provider_name).to eq 'Baz'
+        end
+      end
+
+      describe '#author_name' do
+        it 'returns the author name from structured data' do
+          expect(subject.author_name).to eq 'Hoge'
+        end
+      end
+    end
+  end
 end
--
cgit