Commit 455de15

Merge pull request #104 from embulk/remove-prevent_duplicate_insert
Drop prevent_duplicate_insert which has no use-case now
2 parents: eb3b401 + 89adaf1

File tree: 6 files changed (+2, −60 lines)

README.md

Lines changed: 0 additions & 17 deletions
@@ -50,7 +50,6 @@ v0.3.x has incompatibility changes with v0.2.x. Please see [CHANGELOG.md](CHANGE
 | auto_create_table | boolean | optional | false | See [Dynamic Table Creating](#dynamic-table-creating) and [Time Partitioning](#time-partitioning) |
 | schema_file | string | optional | | /path/to/schema.json |
 | template_table | string | optional | | template table name. See [Dynamic Table Creating](#dynamic-table-creating) |
-| prevent_duplicate_insert | boolean | optional | false | See [Prevent Duplication](#prevent-duplication) |
 | job_status_max_polling_time | int | optional | 3600 sec | Max job status polling time |
 | job_status_polling_interval | int | optional | 10 sec | Job status polling interval |
 | is_skip_job_result_check | boolean | optional | false | Skip waiting Load job finishes. Available for append, or delete_in_advance mode |
@@ -354,22 +353,6 @@ out:
   payload_column_index: 0 # or, payload_column: payload
 ```
 
-### Prevent Duplication
-
-`prevent_duplicate_insert` option is used to prevent inserting same data for modes `append` or `append_direct`.
-
-When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generate job ID from md5 hash of file and other options.
-
-`job ID = md5(md5(file) + dataset + table + schema + source_format + file_delimiter + max_bad_records + encoding + ignore_unknown_values + allow_quoted_newlines)`
-
-[job ID must be unique(including failures)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency) so that same data can't be inserted with same settings repeatedly.
-
-```yaml
-out:
-  type: bigquery
-  prevent_duplicate_insert: true
-```
-
 ### GCS Bucket
 
 This is useful to reduce number of consumed jobs, which is limited by [100,000 jobs per project per day](https://cloud.google.com/bigquery/quotas#load_jobs).
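
For reference, the removed README section described a deterministic job-ID scheme: hash the file contents together with the load settings, so that re-running the same data with the same settings produces the same job ID, which BigQuery then rejects as a duplicate. A minimal Ruby sketch of that formula follows; the removed plugin code did this in `Helper.create_load_job_id`, and the exact task keys and concatenation used there may have differed from this reconstruction.

```ruby
require 'digest/md5'

# Illustrative only: rebuilds the formula from the removed README,
#   job ID = md5(md5(file) + dataset + table + schema + source_format + ...)
# so identical data loaded with identical settings maps to one job ID.
# The task keys below (e.g. 'schema_file' for "schema") are assumptions.
def deterministic_load_job_id(task, path)
  file_md5 = Digest::MD5.file(path).hexdigest
  fields = [
    file_md5,
    task['dataset'], task['table'], task['schema_file'],
    task['source_format'], task['file_delimiter'], task['max_bad_records'],
    task['encoding'], task['ignore_unknown_values'], task['allow_quoted_newlines'],
  ]
  "embulk_load_job_#{Digest::MD5.hexdigest(fields.join)}"
end
```

After this commit, every load instead gets a fresh `embulk_load_job_#{SecureRandom.uuid}`, so re-running the same Embulk job in an append mode always appends.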

example/config_prevent_duplicate_insert.yml

Lines changed: 0 additions & 30 deletions
This file was deleted.

lib/embulk/output/bigquery.rb

Lines changed: 0 additions & 1 deletion
@@ -53,7 +53,6 @@ def self.configure(config, schema, task_count)
         'job_status_max_polling_time' => config.param('job_status_max_polling_time', :integer, :default => 3600),
         'job_status_polling_interval' => config.param('job_status_polling_interval', :integer, :default => 10),
         'is_skip_job_result_check' => config.param('is_skip_job_result_check', :bool, :default => false),
-        'prevent_duplicate_insert' => config.param('prevent_duplicate_insert', :bool, :default => false),
         'with_rehearsal' => config.param('with_rehearsal', :bool, :default => false),
         'rehearsal_counts' => config.param('rehearsal_counts', :integer, :default => 1000),
         'abort_on_error' => config.param('abort_on_error', :bool, :default => nil),

lib/embulk/output/bigquery/bigquery_client.rb

Lines changed: 2 additions & 10 deletions
@@ -79,11 +79,7 @@ def load_from_gcs(object_uris, table)
         begin
           # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
           # we should generate job_id in client code, otherwise, retrying would cause duplication
-          if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
-            job_id = Helper.create_load_job_id(@task, path, fields)
-          else
-            job_id = "embulk_load_job_#{SecureRandom.uuid}"
-          end
+          job_id = "embulk_load_job_#{SecureRandom.uuid}"
           Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{object_uris} => #{@project}:#{@dataset}.#{table} in #{@location_for_log}" }
 
           body = {
@@ -174,11 +170,7 @@ def load(path, table, write_disposition: 'WRITE_APPEND')
         if File.exist?(path)
           # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
           # we should generate job_id in client code, otherwise, retrying would cause duplication
-          if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
-            job_id = Helper.create_load_job_id(@task, path, fields)
-          else
-            job_id = "embulk_load_job_#{SecureRandom.uuid}"
-          end
+          job_id = "embulk_load_job_#{SecureRandom.uuid}"
           Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@project}:#{@dataset}.#{table} in #{@location_for_log}" }
         else
           Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} does not exist, skipped" }
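
The comment kept in both hunks carries the rationale for client-side job IDs: if the client mints the ID, a retry can resend the same ID, and BigQuery treats the second insert as a duplicate of the already-submitted job rather than loading the file twice. A minimal sketch of that retry pattern, with a hypothetical `insert_load_job` stub standing in for the real `jobs.insert` call:

```ruby
require 'securerandom'

# Hypothetical stand-in for the real BigQuery jobs.insert call.
def insert_load_job(job_id)
  puts "inserting load job #{job_id}"
end

# One UUID per logical load, reused across retries of that same load:
# if the first attempt actually reached the server, BigQuery rejects a
# second insert with the same job_id as a duplicate instead of loading
# the data twice.
job_id = "embulk_load_job_#{SecureRandom.uuid}"

attempts = 0
begin
  attempts += 1
  insert_load_job(job_id)
rescue IOError
  retry if attempts < 3  # resend with the SAME job_id
  raise
end
```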

test/test_configure.rb

Lines changed: 0 additions & 1 deletion
@@ -62,7 +62,6 @@ def test_configure_default
     assert_equal 3600, task['job_status_max_polling_time']
     assert_equal 10, task['job_status_polling_interval']
     assert_equal false, task['is_skip_job_result_check']
-    assert_equal false, task['prevent_duplicate_insert']
     assert_equal false, task['with_rehearsal']
     assert_equal 1000, task['rehearsal_counts']
     assert_equal [], task['column_options']

test/test_example.rb

Lines changed: 0 additions & 1 deletion
@@ -33,7 +33,6 @@ def embulk_run(config_path)
   files.each do |config_path|
     if %w[
       config_expose_errors.yml
-      config_prevent_duplicate_insert.yml
     ].include?(File.basename(config_path))
       define_method(:"test_#{File.basename(config_path, ".yml")}") do
         assert_false embulk_run(config_path)
