From 53aca8aecf33ca47825da8b2d10ecff089df5c64 Mon Sep 17 00:00:00 2001
From: OSAMU SATO
Date: Fri, 26 Nov 2021 16:29:53 +0900
Subject: Add batch_size option to bin/tootctl search deploy (#17049)

---
 lib/mastodon/search_cli.rb | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/mastodon/search_cli.rb b/lib/mastodon/search_cli.rb
index 2d1ca1c05..e36af24f4 100644
--- a/lib/mastodon/search_cli.rb
+++ b/lib/mastodon/search_cli.rb
@@ -17,6 +17,7 @@ module Mastodon
     ].freeze

     option :concurrency, type: :numeric, default: 2, aliases: [:c], desc: 'Workload will be split between this number of threads'
+    option :batch_size, type: :numeric, default: 1_000, aliases: [:b], desc: 'Number of records in each batch'
     option :only, type: :array, enum: %w(accounts tags statuses), desc: 'Only process these indices'
     desc 'deploy', 'Create or upgrade ElasticSearch indices and populate them'
     long_desc <<~LONG_DESC
       If ElasticSearch is empty, this command will create the necessary indices
       and then import data from the database into those indices.

       This command will also upgrade indices if the underlying schema has been
@@ -35,6 +36,11 @@ module Mastodon
         exit(1)
       end

+      if options[:batch_size] < 1
+        say('Cannot run with this batch_size setting, must be at least 1', :red)
+        exit(1)
+      end
+
       indices = begin
         if options[:only]
           options[:only].map { |str| "#{str.camelize}Index".constantize }
@@ -73,7 +79,7 @@ module Mastodon
       # is uneconomical. So we only ever add.
       indices.each do |index|
         progress.title = "Importing #{index} "
-        batch_size = 1_000
+        batch_size = options[:batch_size]
         slice_size = (batch_size / options[:concurrency]).ceil

         index.adapter.default_scope.reorder(nil).find_in_batches(batch_size: batch_size) do |batch|
-- cgit
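Note: the patch above makes the import batch size configurable instead of hard-coding 1,000 records. Each batch read with find_in_batches is divided among the worker threads in slices of roughly batch_size / concurrency records, so the two options together tune database round-trips and per-thread workload; with Thor's usual dash-for-underscore switch naming this surfaces as, for example, tootctl search deploy --batch-size 5000 --concurrency 4. The standalone Ruby sketch below is illustrative only, not code from the patch (the float conversion is added here so odd combinations do not truncate):

    # Illustrative sketch, not part of the patch: how a single batch is
    # divided across worker threads for indexing.
    batch_size  = 1_000   # default of the new :batch_size option
    concurrency = 2       # default of the :concurrency option

    slice_size = (batch_size.to_f / concurrency).ceil
    batch      = (1..batch_size).to_a            # stand-in for a batch of records

    slices = batch.each_slice(slice_size).to_a
    puts "#{slices.size} slices of up to #{slice_size} records each"
    # => 2 slices of up to 500 records each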
From 06631fdc53d6c464aa9db4e31d35f1fe354bcc95 Mon Sep 17 00:00:00 2001
From: Takeshi Umeda
Date: Fri, 26 Nov 2021 16:30:02 +0900
Subject: Fix ElasticSearch to Elasticsearch (#17050)

---
 .env.nanobox                 | 2 +-
 .env.production.sample       | 2 +-
 config/initializers/chewy.rb | 6 +++---
 lib/mastodon/search_cli.rb   | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'lib')

diff --git a/.env.nanobox b/.env.nanobox
index d61673836..ad941c947 100644
--- a/.env.nanobox
+++ b/.env.nanobox
@@ -13,7 +13,7 @@ DB_PORT=5432

 # DATABASE_URL=postgresql://$DATA_DB_USER:$DATA_DB_PASS@$DATA_DB_HOST/gonano

-# Optional ElasticSearch configuration
+# Optional Elasticsearch configuration
 ES_ENABLED=true
 ES_HOST=$DATA_ELASTIC_HOST
 ES_PORT=9200
diff --git a/.env.production.sample b/.env.production.sample
index c24c31c9b..8eeff3794 100644
--- a/.env.production.sample
+++ b/.env.production.sample
@@ -29,7 +29,7 @@ DB_NAME=mastodon_production
 DB_PASS=
 DB_PORT=5432

-# ElasticSearch (optional)
+# Elasticsearch (optional)
 # ------------------------
 ES_ENABLED=true
 ES_HOST=localhost
diff --git a/config/initializers/chewy.rb b/config/initializers/chewy.rb
index fbbcbbcde..f303fc54d 100644
--- a/config/initializers/chewy.rb
+++ b/config/initializers/chewy.rb
@@ -17,7 +17,7 @@ Chewy.settings = {
 }

 # We use our own async strategy even outside the request-response
-# cycle, which takes care of checking if ElasticSearch is enabled
+# cycle, which takes care of checking if Elasticsearch is enabled
 # or not. However, mind that for the Rails console, the :urgent
 # strategy is set automatically with no way to override it.
 Chewy.root_strategy = :custom_sidekiq
@@ -32,8 +32,8 @@ module Chewy
   end
 end

-# ElasticSearch uses Faraday internally. Faraday interprets the
+# Elasticsearch uses Faraday internally. Faraday interprets the
 # http_proxy env variable by default which leads to issues when
 # Mastodon is run with hidden services enabled, because
-# ElasticSearch is *not* supposed to be accessed through a proxy
+# Elasticsearch is *not* supposed to be accessed through a proxy
 Faraday.ignore_env_proxy = true
diff --git a/lib/mastodon/search_cli.rb b/lib/mastodon/search_cli.rb
index e36af24f4..6ad9d7b6a 100644
--- a/lib/mastodon/search_cli.rb
+++ b/lib/mastodon/search_cli.rb
@@ -19,9 +19,9 @@ module Mastodon
     option :concurrency, type: :numeric, default: 2, aliases: [:c], desc: 'Workload will be split between this number of threads'
     option :batch_size, type: :numeric, default: 1_000, aliases: [:b], desc: 'Number of records in each batch'
     option :only, type: :array, enum: %w(accounts tags statuses), desc: 'Only process these indices'
-    desc 'deploy', 'Create or upgrade ElasticSearch indices and populate them'
+    desc 'deploy', 'Create or upgrade Elasticsearch indices and populate them'
     long_desc <<~LONG_DESC
-      If ElasticSearch is empty, this command will create the necessary indices
+      If Elasticsearch is empty, this command will create the necessary indices
       and then import data from the database into those indices.

       This command will also upgrade indices if the underlying schema has been
-- cgit
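Note: the rename above standardises on the project's official spelling "Elasticsearch", but the chewy.rb comment it touches documents a real constraint: Faraday, the HTTP client used for Elasticsearch, honours the http_proxy environment variable by default, so an instance running with hidden services enabled would otherwise push search traffic through the proxy. A minimal sketch of what the initializer guards against follows; the proxy address and a local Elasticsearch on ES_HOST=localhost / ES_PORT=9200 are assumptions for illustration, not values taken from the patches:

    # Illustrative sketch, not part of the patch: without ignore_env_proxy,
    # Faraday would pick up http_proxy and route Elasticsearch requests
    # through it as well.
    require 'faraday'

    ENV['http_proxy'] = 'http://127.0.0.1:3128'       # hypothetical proxy for outgoing traffic

    Faraday.ignore_env_proxy = true                   # what config/initializers/chewy.rb sets

    conn = Faraday.new(url: 'http://localhost:9200')  # assumed local Elasticsearch
    puts conn.get('/_cluster/health').status          # goes directly to Elasticsearch, not via the proxy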
From 0ac7efdc50200020f42b2d94151ecd80144e3d8c Mon Sep 17 00:00:00 2001
From: Takeshi Umeda
Date: Sat, 27 Nov 2021 06:08:47 +0900
Subject: Fix performance of tootctl statuses remove (#17052)

* Fix performance of tootctl statuses remove

* Fix model class
---
 lib/mastodon/statuses_cli.rb | 94 +++++++++++++++++++++++++++++++-------------
 1 file changed, 67 insertions(+), 27 deletions(-)

(limited to 'lib')

diff --git a/lib/mastodon/statuses_cli.rb b/lib/mastodon/statuses_cli.rb
index b9dccdd8a..f841529e0 100644
--- a/lib/mastodon/statuses_cli.rb
+++ b/lib/mastodon/statuses_cli.rb
@@ -6,6 +6,7 @@ require_relative 'cli_helper'

 module Mastodon
   class StatusesCLI < Thor
+    include CLIHelper
     include ActionView::Helpers::NumberHelper

     def self.exit_on_failure?
@@ -15,6 +16,8 @@ module Mastodon
     option :days, type: :numeric, default: 90
     option :clean_followed, type: :boolean
     option :skip_media_remove, type: :boolean
+    option :vacuum, type: :boolean, default: false, desc: 'Reduce the file size and update the statistics. This option locks the table for a long time, so run it offline'
+    option :batch_size, type: :numeric, default: 1_000, aliases: [:b], desc: 'Number of records in each batch'
     desc 'remove', 'Remove unreferenced statuses'
     long_desc <<~LONG_DESC
       Remove statuses that are not referenced by local user activity, such as
@@ -25,52 +28,89 @@ module Mastodon
       indices before commencing, and removes them afterward.
     LONG_DESC
     def remove
+      if options[:batch_size] < 1
+        say('Cannot run with this batch_size setting, must be at least 1', :red)
+        exit(1)
+      end
+
       say('Creating temporary database indices...')

-      ActiveRecord::Base.connection.add_index(:accounts, :id, name: :index_accounts_local, where: 'domain is null', algorithm: :concurrently) unless ActiveRecord::Base.connection.index_name_exists?(:accounts, :index_accounts_local)
-      ActiveRecord::Base.connection.add_index(:status_pins, :status_id, name: :index_status_pins_status_id, algorithm: :concurrently) unless ActiveRecord::Base.connection.index_name_exists?(:status_pins, :index_status_pins_status_id)
-      ActiveRecord::Base.connection.add_index(:media_attachments, :remote_url, name: :index_media_attachments_remote_url, where: 'remote_url is not null', algorithm: :concurrently) unless ActiveRecord::Base.connection.index_name_exists?(:media_attachments, :index_media_attachments_remote_url)
+      ActiveRecord::Base.connection.add_index(:accounts, :id, name: :index_accounts_local, where: 'domain is null', algorithm: :concurrently, if_not_exists: true)
+      ActiveRecord::Base.connection.add_index(:status_pins, :status_id, name: :index_status_pins_status_id, algorithm: :concurrently, if_not_exists: true)
+      ActiveRecord::Base.connection.add_index(:media_attachments, :remote_url, name: :index_media_attachments_remote_url, where: 'remote_url is not null', algorithm: :concurrently, if_not_exists: true)

       max_id = Mastodon::Snowflake.id_at(options[:days].days.ago)
       start_at = Time.now.to_f

+      say('Extract the deletion target... This might take a while...')
+
+      ActiveRecord::Base.connection.create_table('statuses_to_be_deleted', temporary: true)
+
+      # Skip accounts followed by local accounts
+      clean_followed_sql = 'AND NOT EXISTS (SELECT 1 FROM follows WHERE statuses.account_id = follows.target_account_id)' unless options[:clean_followed]
+
+      ActiveRecord::Base.connection.exec_insert(<<-SQL.squish, 'SQL', [[nil, max_id]])
+        INSERT INTO statuses_to_be_deleted (id)
+        SELECT statuses.id FROM statuses WHERE deleted_at IS NULL AND NOT local AND uri IS NOT NULL AND (id < $1)
+        AND NOT EXISTS (SELECT 1 FROM statuses AS statuses1 WHERE statuses.id = statuses1.in_reply_to_id)
+        AND NOT EXISTS (SELECT 1 FROM statuses AS statuses1 WHERE statuses1.id = statuses.reblog_of_id AND (statuses1.uri IS NULL OR statuses1.local))
+        AND NOT EXISTS (SELECT 1 FROM statuses AS statuses1 WHERE statuses.id = statuses1.reblog_of_id AND (statuses1.uri IS NULL OR statuses1.local OR statuses1.id >= $1))
+        AND NOT EXISTS (SELECT 1 FROM status_pins WHERE statuses.id = status_id)
+        AND NOT EXISTS (SELECT 1 FROM mentions WHERE statuses.id = mentions.status_id AND mentions.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))
+        AND NOT EXISTS (SELECT 1 FROM favourites WHERE statuses.id = favourites.status_id AND favourites.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))
+        AND NOT EXISTS (SELECT 1 FROM bookmarks WHERE statuses.id = bookmarks.status_id AND bookmarks.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))
+        #{clean_followed_sql}
+      SQL
+
+      say('Removing temporary database indices to restore write performance...')
+
+      ActiveRecord::Base.connection.remove_index(:accounts, name: :index_accounts_local, if_exists: true)
+      ActiveRecord::Base.connection.remove_index(:status_pins, name: :index_status_pins_status_id, if_exists: true)
+
       say('Beginning removal... This might take a while...')

-      scope = Status.remote.where('id < ?', max_id)
-      # Skip reblogs of local statuses
-      scope = scope.where('reblog_of_id NOT IN (SELECT statuses1.id FROM statuses AS statuses1 WHERE statuses1.id = statuses.reblog_of_id AND (statuses1.uri IS NULL OR statuses1.local))')
-      # Skip statuses that are pinned on profiles
-      scope = scope.where('id NOT IN (SELECT status_pins.status_id FROM status_pins WHERE statuses.id = status_id)')
-      # Skip statuses that mention local accounts
-      scope = scope.where('id NOT IN (SELECT mentions.status_id FROM mentions WHERE statuses.id = mentions.status_id AND mentions.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))')
-      # Skip statuses which have replies
-      scope = scope.where('id NOT IN (SELECT statuses1.in_reply_to_id FROM statuses AS statuses1 WHERE statuses.id = statuses1.in_reply_to_id)')
-      # Skip statuses reblogged by local accounts or with recent boosts
-      scope = scope.where('id NOT IN (SELECT statuses1.reblog_of_id FROM statuses AS statuses1 WHERE statuses.id = statuses1.reblog_of_id AND (statuses1.uri IS NULL OR statuses1.local OR statuses1.id >= ?))', max_id)
-      # Skip statuses favourited by local users
-      scope = scope.where('id NOT IN (SELECT favourites.status_id FROM favourites WHERE statuses.id = favourites.status_id AND favourites.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))')
-      # Skip statuses bookmarked by local users
-      scope = scope.where('id NOT IN (SELECT bookmarks.status_id FROM bookmarks WHERE statuses.id = bookmarks.status_id AND bookmarks.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))')
-
-      unless options[:clean_followed]
-        # Skip accounts followed by local accounts
-        scope = scope.where('account_id NOT IN (SELECT follows.target_account_id FROM follows WHERE statuses.account_id = follows.target_account_id)')
+      klass = Class.new(ApplicationRecord) do |c|
+        c.table_name = 'statuses_to_be_deleted'
       end

-      scope.in_batches.delete_all
+      Object.const_set('StatusToBeDeleted', klass)
+
+      scope = StatusToBeDeleted
+      processed = 0
+      removed = 0
+      progress = create_progress_bar(scope.count.fdiv(options[:batch_size]).ceil)
+
+      scope.reorder(nil).in_batches(of: options[:batch_size]) do |relation|
+        ids = relation.pluck(:id)
+        processed += ids.count
+        removed += Status.unscoped.where(id: ids).delete_all
+        progress.increment
+      end
+
+      progress.stop
+
+      if options[:vacuum]
+        say('Run VACUUM and ANALYZE to statuses...')
+
+        ActiveRecord::Base.connection.execute('VACUUM FULL ANALYZE statuses')
+      else
+        say('Run ANALYZE to statuses...')
+
+        ActiveRecord::Base.connection.execute('ANALYZE statuses')
+      end

       unless options[:skip_media_remove]
         say('Beginning removal of now-orphaned media attachments to free up disk space...')
         Scheduler::MediaCleanupScheduler.new.perform
       end

-      say("Done after #{Time.now.to_f - start_at}s", :green)
+      say("Done after #{Time.now.to_f - start_at}s, removed #{removed} out of #{processed} statuses.", :green)
     ensure
       say('Removing temporary database indices to restore write performance...')

-      ActiveRecord::Base.connection.remove_index(:accounts, name: :index_accounts_local) if ActiveRecord::Base.connection.index_name_exists?(:accounts, :index_accounts_local)
-      ActiveRecord::Base.connection.remove_index(:status_pins, name: :index_status_pins_status_id) if ActiveRecord::Base.connection.index_name_exists?(:status_pins, :index_status_pins_status_id)
-      ActiveRecord::Base.connection.remove_index(:media_attachments, name: :index_media_attachments_remote_url) if ActiveRecord::Base.connection.index_name_exists?(:media_attachments, :index_media_attachments_remote_url)
+      ActiveRecord::Base.connection.remove_index(:accounts, name: :index_accounts_local, if_exists: true)
+      ActiveRecord::Base.connection.remove_index(:status_pins, name: :index_status_pins_status_id, if_exists: true)
+      ActiveRecord::Base.connection.remove_index(:media_attachments, name: :index_media_attachments_remote_url, if_exists: true)
     end
   end
 end
-- cgit
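Note: the patch above restructures tootctl statuses remove. Instead of deleting through a relation built from chained NOT IN sub-selects, it first materialises the ids to delete into a temporary statuses_to_be_deleted table with a single INSERT ... SELECT guarded by NOT EXISTS filters, then deletes them in batch_size chunks through an anonymous ActiveRecord model bound to that table, and finally runs ANALYZE (or VACUUM FULL ANALYZE with the new --vacuum option). The standalone sketch below reproduces only that pattern; it is not the patch itself, and it substitutes an in-memory SQLite database and a trivial "local" flag for the real NOT EXISTS filters:

    # Standalone sketch of the "materialise ids, then delete in batches" pattern.
    # Assumes the activerecord and sqlite3 gems are installed.
    require 'active_record'

    ActiveRecord::Base.establish_connection(adapter: 'sqlite3', database: ':memory:')
    ActiveRecord::Base.connection.create_table(:statuses) { |t| t.boolean :local, default: false, null: false }
    ActiveRecord::Base.connection.create_table(:statuses_to_be_deleted)

    class Status < ActiveRecord::Base; end

    # Same trick as the patch: bind an anonymous model to the scratch table.
    klass = Class.new(ActiveRecord::Base) do |c|
      c.table_name = 'statuses_to_be_deleted'
    end
    Object.const_set('StatusToBeDeleted', klass)

    25.times { |i| Status.create!(local: i.even?) }

    # One pass to extract the deletion targets (the real query applies many NOT EXISTS filters).
    ActiveRecord::Base.connection.execute(
      'INSERT INTO statuses_to_be_deleted (id) SELECT id FROM statuses WHERE NOT local'
    )

    removed = 0
    StatusToBeDeleted.in_batches(of: 10) do |relation|
      removed += Status.where(id: relation.pluck(:id)).delete_all
    end
    puts "removed #{removed} statuses"   # => removed 12 statuses

The VACUUM FULL ANALYZE / ANALYZE step in the patch is PostgreSQL-specific and is deliberately left out of this sketch.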