diff --git a/Gemfile b/Gemfile index defca27..387f8c3 100644 --- a/Gemfile +++ b/Gemfile @@ -44,6 +44,8 @@ gem "bootsnap", require: false # Use Active Storage variants [https://guides.rubyonrails.org/active_storage_overview.html#transforming-images] # gem "image_processing", "~> 1.2" +gem 'mechanize' + group :development, :test do # See https://guides.rubyonrails.org/debugging_rails_applications.html#debugging-with-the-debug-gem gem "debug", platforms: %i[ mri mswin mswin64 mingw x64_mingw ] diff --git a/Gemfile.lock b/Gemfile.lock index f76c722..5a44cf8 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -122,10 +122,13 @@ GEM irb (~> 1.10) reline (>= 0.3.8) docile (1.4.1) + domain_name (0.6.20240107) drb (2.2.1) erubi (1.13.0) globalid (1.2.1) activesupport (>= 6.1) + http-cookie (1.0.7) + domain_name (~> 0.5) i18n (1.14.6) concurrent-ruby (~> 1.0) importmap-rails (2.0.1) @@ -153,10 +156,30 @@ GEM net-smtp marcel (1.0.4) matrix (0.4.2) + mechanize (2.12.2) + addressable (~> 2.8) + base64 + domain_name (~> 0.5, >= 0.5.20190701) + http-cookie (~> 1.0, >= 1.0.3) + mime-types (~> 3.3) + net-http-digest_auth (~> 1.4, >= 1.4.1) + net-http-persistent (>= 2.5.2, < 5.0.dev) + nkf + nokogiri (~> 1.11, >= 1.11.2) + rubyntlm (~> 0.6, >= 0.6.3) + webrick (~> 1.7) + webrobots (~> 0.1.2) + mime-types (3.6.0) + logger + mime-types-data (~> 3.2015) + mime-types-data (3.2024.1105) mini_mime (1.1.5) minitest (5.25.1) msgpack (1.7.2) mutex_m (0.2.0) + net-http-digest_auth (1.4.1) + net-http-persistent (4.0.4) + connection_pool (~> 2.2) net-imap (0.4.14) date net-protocol @@ -167,6 +190,7 @@ GEM net-smtp (0.5.0) net-protocol nio4r (2.7.3) + nkf (0.2.0) nokogiri (1.16.7-aarch64-linux) racc (~> 1.4) nokogiri (1.16.7-arm-linux) @@ -249,6 +273,8 @@ GEM rubocop-ast (1.32.0) parser (>= 3.3.1.0) ruby-progressbar (1.13.0) + rubyntlm (0.6.5) + base64 rubyzip (2.3.2) selenium-webdriver (4.23.0) base64 (~> 0.2) @@ -293,6 +319,7 @@ GEM bindex (>= 0.4.0) railties (>= 6.0.0) webrick (1.8.2) + 
webrobots (0.1.2) websocket (1.2.11) websocket-driver (0.7.6) websocket-extensions (>= 0.1.0) @@ -318,6 +345,7 @@ DEPENDENCIES debug importmap-rails jbuilder + mechanize pg puma (>= 5.0) rails (~> 7.1.3, >= 7.1.3.4) diff --git a/README.md b/README.md index f8c1d30..98fdc54 100644 --- a/README.md +++ b/README.md @@ -9,3 +9,9 @@ From `heroku run bash` Running bash on ⬢ blade-ruby-lang... up, run.7782 ~ $ ./bin/rails runner import.rb --list ruby-list --from 1001 --to 2000 ``` + +From web + +``` +~ $ ./bin/rails runner import_from_web.rb --list ruby-list --from 1001 --to 2000 +``` diff --git a/app/models/message.rb b/app/models/message.rb index 23c7329..8bbde08 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -2,6 +2,7 @@ BLADE_BUCKET_NAME = 'blade.ruby-lang.org' require 'kconv' +require 'mechanize' class Message < ApplicationRecord # Not really sure we will utlize this configuration, @@ -19,6 +20,15 @@ def self.from_s3(list_name, list_seq, s3_client = Aws::S3::Client.new(region: BL m end + def self.from_web(list_name, list_seq, web_client = Mechanize.new) + obj = web_client.get("https://blade.ruby-lang.org/#{list_name}/#{list_seq}") + + m = self.from_string(obj.body) + m.list_id = List.find_by_name(list_name).id + m.list_seq = list_seq + m + end + def self.from_string(str) # There are a few hacks to import messages from blade.ruby-lang.org's # S3 bucket. 
# import_from_web.rb
#
# Imports a contiguous range of mailing-list messages by fetching each one
# with Message.from_web and saving it.
#
# Usage:
#   ./bin/rails runner import_from_web.rb --list ruby-list --from 1001 --to 2000
#
# Options:
#   --list LIST  name of the mailing list, passed through to Message.from_web
#   --from FROM  first sequence number to import (Integer, inclusive)
#   --to TO      last sequence number to import (Integer, inclusive)
#
# A failure on one message is reported on STDERR and the loop moves on to the
# next sequence number; it never aborts the whole run.

require 'optparse'

params = {}
OptionParser.new do |opts|
  opts.on('--list LIST')
  opts.on('--from FROM', Integer)
  opts.on('--to TO', Integer)
end.parse!(into: params)

# Without this check a missing --from/--to produces (nil..nil).each, which
# fails with an unhelpful TypeError instead of telling the user what to pass.
missing = %i[list from to].reject { |name| params.key?(name) }
unless missing.empty?
  flags = missing.map { |name| "--#{name}" }.join(', ')
  abort("missing required option(s): #{flags}")
end

list = params[:list]

(params[:from]..params[:to]).each do |seq|
  message = Message.from_web(list, seq)
  # save! raises when the record cannot be persisted, so the failure is
  # reported by the generic handler below instead of being dropped silently.
  message.save!
rescue ActiveRecord::RecordNotUnique
  STDERR.puts("#{list}:#{seq} already exists in Postgres")
rescue Mechanize::ResponseCodeError => e
  # Mechanize raises this for non-success HTTP statuses; only a 404 means the
  # message is absent, anything else is reported as a generic failure.
  if e.response_code.to_s == '404'
    STDERR.puts("#{list}:#{seq} doesn't exist in Web")
  else
    STDERR.puts("failed to import #{list}:#{seq}: #{e}")
  end
rescue StandardError => e
  STDERR.puts("failed to import #{list}:#{seq}: #{e}")
ensure
  # Pause after every attempt, successful or not (presumably to avoid
  # hammering the web server -- confirm the intended rate).
  sleep 1
end