#!/bin/bash

# exit on error
set -e

# ----------------------------------------------------------------------
# START CONFIGURATION SECTION
# ----------------------------------------------------------------------

# name of the database to dump
# this script assumes the user running it has access to the database
DATABASE=${DATABASE:-"bety"}

# psql options
# this allows you to specify the user to connect as, as well as any other psql options
PG_OPT=${PG_OPT-"-U bety"}
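# note: ${PG_OPT-...} (without the colon) only falls back to "-U bety" when
# PG_OPT is completely unset; an explicitly empty PG_OPT="" is kept, so psql
# can be forced to use its own defaults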

# IDs used in database
# These IDs need to be unique for the sharing to work. If you want
# to share your data, send email to kooper@illinois.edu to claim
# your ID range. The master list is maintained at
# https://github.com/PecanProject/bety/wiki/Distributed-BETYdb
#
# 0 - EBI - David LeBauer
# 1 - BU - Mike Dietze
# 2 - Brookhaven - Shawn Serbin
# 3 - Purdue - Jeanne Osnas
# 4 - Virginia Tech - Quinn Thomas
# 99 - VM
MYSITE=${MYSITE:-99}

# access level requirement
# ranges from 0 (private) to 4 (public)
LEVEL=${LEVEL:-3}
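# only rows from the restricted tables (traits, yields) whose access_level is
# greater than or equal to this value are exported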

# dump unchecked traits and yields
# set this to "YES" to dump all unchecked traits/yields as well
UNCHECKED=${UNCHECKED:-"NO"}

# keep users
# set this to YES to dump all user information, otherwise it will
# be anonymized
KEEPUSERS=${KEEPUSERS:-"NO"}

# folder where the results are written (a version.txt file and a bety.tar.gz tarball)
OUTPUT=${OUTPUT:-"$PWD/dump"}

# Should the process be quiet
QUIET=${QUIET:-"NO"}

# Should all the data be dumped
ALLDATA=${ALLDATA:-"NO"}

# ----------------------------------------------------------------------
# END CONFIGURATION SECTION
# ----------------------------------------------------------------------

# parse command line options
while getopts ad:hkl:m:o:p:qu opt; do
  case $opt in
    a)
      ALLDATA="YES"
      ;;
    d)
      DATABASE=$OPTARG
      ;;
    h)
      echo "$0 [-a] [-d database] [-h] [-k] [-l 0,1,2,3,4] [-m siteid] [-o folder] [-p psql options] [-q] [-u]"
      echo " -a dump all records"
      echo " -d database, default is bety"
      echo " -h this help page"
      echo " -k keep users, default is to anonymize them"
      echo " -l level of data that can be dumped, default is 3"
      echo " -m site id, default is 99 (VM)"
      echo " -o output folder where dumped data is written, default is dump"
      echo " -p additional psql command line options, default is -U bety"
      echo " -q should the export be quiet"
      echo " -u should unchecked data be dumped, default is NO"
      exit 0
      ;;
    k)
      KEEPUSERS="YES"
      ;;
    l)
      LEVEL=$OPTARG
      ;;
    m)
      MYSITE=$OPTARG
      ;;
    o)
      OUTPUT=$OPTARG
      ;;
    p)
      PG_OPT=$OPTARG
      ;;
    q)
      QUIET="YES"
      ;;
    u)
      UNCHECKED="YES"
      ;;
  esac
done
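
# example invocation (assuming this script is saved as dump.bety.sh):
#   ./dump.bety.sh -d bety -m 1 -l 4 -o /tmp/betydump
# dumps site 1's id range from the "bety" database, restricting traits and
# yields to access_level 4 (public), and writes version.txt and bety.tar.gz
# to /tmp/betydump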

# Table that contains the users; this table will be anonymized unless KEEPUSERS is YES
USER_TABLES="users"

# list of all tables, schema_migrations is ignored since that
# will be imported during creation
CLEAN_TABLES="attributes benchmark_sets benchmarks"
CLEAN_TABLES="${CLEAN_TABLES} citations covariates cultivars dbfiles"
CLEAN_TABLES="${CLEAN_TABLES} ensembles entities experiments formats inputs"
CLEAN_TABLES="${CLEAN_TABLES} likelihoods machines managements metrics"
CLEAN_TABLES="${CLEAN_TABLES} methods mimetypes models modeltypes"
CLEAN_TABLES="${CLEAN_TABLES} pfts posteriors priors reference_runs"
CLEAN_TABLES="${CLEAN_TABLES} runs sites species treatments"
CLEAN_TABLES="${CLEAN_TABLES} variables workflows"
CLEAN_TABLES="${CLEAN_TABLES} projects sitegroups"

# tables with a checked flag that needs to be taken into account
CHECK_TABLES="traits yields"

# tables that have many to many relationships
MANY_TABLES="benchmarks_benchmarks_reference_runs benchmarks_ensembles"
MANY_TABLES="${MANY_TABLES} benchmarks_ensembles_scores benchmarks_metrics benchmark_sets_benchmark_reference_runs"
MANY_TABLES="${MANY_TABLES} citations_sites citations_treatments"
MANY_TABLES="${MANY_TABLES} cultivars_pfts current_posteriors"
MANY_TABLES="${MANY_TABLES} experiments_sites experiments_treatments"
MANY_TABLES="${MANY_TABLES} formats_variables inputs_runs"
MANY_TABLES="${MANY_TABLES} managements_treatments modeltypes_formats"
MANY_TABLES="${MANY_TABLES} pfts_priors pfts_species"
MANY_TABLES="${MANY_TABLES} posterior_samples posteriors_ensembles"
MANY_TABLES="${MANY_TABLES} sitegroups_sites sites_cultivars trait_covariate_associations"

# tables that should NOT be dumped
IGNORE_TABLES="sessions"
SYSTEM_TABLES="schema_migrations spatial_ref_sys"

# be quiet if not interactive
if ! tty -s ; then
  exec 1>/dev/null
fi

# this value should be constant, do not change
ID_RANGE=1000000000
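# worked example: with the default MYSITE=99 the dump covers ids
# 99 * 1000000000 + 1 = 99000000001 through 100000000000, i.e. the block of
# one billion ids reserved for the VM site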

# make output folder
mkdir -p "${OUTPUT}"
DUMPDIR="/tmp/$$"
mkdir -p "${DUMPDIR}"
chmod 777 "${DUMPDIR}"

# compute range based on MYSITE
if [ "${ALLDATA}" != "YES" ]; then
  START_ID=$(( MYSITE * ID_RANGE + 1 ))
  LAST_ID=$(( START_ID + ID_RANGE - 1 ))
  if [ "${QUIET}" != "YES" ]; then
    echo "Dumping all items that have id : [${START_ID} - ${LAST_ID}]"
  fi
  LIMIT="(id >= ${START_ID} AND id <= ${LAST_ID})"
else
  LIMIT="TRUE"
  if [ "${QUIET}" != "YES" ]; then
    echo "Dumping all items that have id : ALL ITEMS"
  fi
fi

# find current schema version
# the following queries compute three values:
# - number of migrations
# - md5 hash over all migration versions (used as the schema version)
# - latest (largest) migration
MIGRATIONS=$( psql ${PG_OPT} -t -q -d "${DATABASE}" -c 'SELECT COUNT(version) FROM schema_migrations' | tr -d ' ' )
VERSION=$( psql ${PG_OPT} -t -q -d "${DATABASE}" -c 'SELECT md5(array_agg(version)::text) FROM (SELECT version FROM schema_migrations ORDER BY version) as v;' | tr -d ' ' )
LATEST=$( psql ${PG_OPT} -t -q -d "${DATABASE}" -c 'SELECT version FROM schema_migrations ORDER BY version DESC LIMIT 1' | tr -d ' ' )
NOW=$( date -u +"%Y-%m-%dT%H:%M:%SZ" )
echo "${MIGRATIONS} ${VERSION} ${LATEST} ${NOW}" > "${OUTPUT}/version.txt"
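# version.txt now contains a single line of the form:
#   <number of migrations> <md5 schema hash> <latest migration> <UTC timestamp>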

# dump schema
if [ "${QUIET}" != "YES" ]; then
  printf "Dumping %-25s : " "schema"
fi
pg_dump ${PG_OPT} -s "${DATABASE}" -O -x > "${DUMPDIR}/${VERSION}.schema"
if [ "${QUIET}" != "YES" ]; then
  echo "DUMPED version ${VERSION} with ${MIGRATIONS} migrations, latest migration is ${LATEST}"
fi

# dump ruby special table
if [ "${QUIET}" != "YES" ]; then
  printf "Dumping %-25s : " "schema_migrations"
fi
ADD=$( psql ${PG_OPT} -t -q -d "${DATABASE}" -c "SELECT count(*) FROM schema_migrations;" | tr -d ' ' )
psql ${PG_OPT} -t -q -d "${DATABASE}" -c "\COPY schema_migrations TO '${DUMPDIR}/schema_migrations.csv' WITH (DELIMITER ' ', NULL '\\N', ESCAPE '\\', FORMAT CSV, ENCODING 'UTF-8')"
if [ "${QUIET}" != "YES" ]; then
  echo "DUMPED ${ADD}"
fi
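# note: \COPY (unlike server-side COPY) is executed by the psql client, so the
# CSV files are written by the user running this script and do not require
# file-system access on the database server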

# skip following tables
# - inputs_runs (PEcAn, site specific)
# - posteriors_runs (PEcAn, site specific, is this used?)
# - runs (PEcAn, site specific)
# - workflows (PEcAn, site specific)

# dump users
if [ "${QUIET}" != "YES" ]; then
  printf "Dumping %-25s : " "users"
fi
if [ "${KEEPUSERS}" == "YES" ]; then
  psql ${PG_OPT} -t -q -d "${DATABASE}" -c "\COPY (SELECT * FROM ${USER_TABLES} WHERE ${LIMIT}) TO '${DUMPDIR}/users.csv' WITH (DELIMITER ' ', NULL '\\N', ESCAPE '\\', FORMAT CSV, ENCODING 'UTF-8')"
else
  psql ${PG_OPT} -t -q -d "${DATABASE}" -c "\COPY (SELECT id, CONCAT('user', id) AS login, CONCAT('user ' , id) AS name, CONCAT('betydb+', id, '@gmail.com') as email, 'Urbana' AS city, 'USA' AS country, '' AS area, '1234567890abcdef' AS crypted_password, 'BU' AS salt, NOW() AS created_at, NOW() AS updated_at, NULL as remember_token, NULL AS remember_token_expires_at, 3 AS access_level, 4 AS page_access_level, NULL AS apikey, 'IL' AS state_prov, '61801' AS postal_code FROM ${USER_TABLES} WHERE ${LIMIT}) TO '${DUMPDIR}/users.csv' WITH (DELIMITER ' ', NULL '\\N', ESCAPE '\\', FORMAT CSV, ENCODING 'UTF-8')"
fi
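# note: when KEEPUSERS is not YES, every exported user row is replaced with
# generated placeholder values (login "user<id>", a dummy password hash,
# generic address fields); only the id is preserved, and access_level /
# page_access_level are reset to 3 / 4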
ADD=$( psql ${PG_OPT} -t -q -d "${DATABASE}" -c "SELECT count(*) FROM ${USER_TABLES} WHERE ${LIMIT};" | tr -d ' ' )
if [ "${QUIET}" != "YES" ]; then
  echo "DUMPED ${ADD}"
fi

# unrestricted tables
for T in ${CLEAN_TABLES} ${MANY_TABLES}; do
  if [ "${QUIET}" != "YES" ]; then
    printf "Dumping %-25s : " "${T}"
  fi
  psql ${PG_OPT} -t -q -d "${DATABASE}" -c "\COPY (SELECT * FROM ${T} WHERE ${LIMIT}) TO '${DUMPDIR}/${T}.csv' WITH (DELIMITER ' ', NULL '\\N', ESCAPE '\\', FORMAT CSV, ENCODING 'UTF-8')"
  ADD=$( psql ${PG_OPT} -t -q -d "${DATABASE}" -c "SELECT count(*) FROM ${T} WHERE ${LIMIT}" | tr -d ' ' )
  if [ "${QUIET}" != "YES" ]; then
    echo "DUMPED ${ADD}"
  fi
done

# restricted and unchecked tables
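# only rows with access_level >= ${LEVEL} are exported; rows with checked = -1
# are skipped unless UNCHECKED is set to YES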
for T in ${CHECK_TABLES}; do
  if [ "${QUIET}" != "YES" ]; then
    printf "Dumping %-25s : " "${T}"
  fi
  if [ "${UNCHECKED}" == "YES" ]; then
    UNCHECKED_QUERY=""
  else
    UNCHECKED_QUERY="AND checked != -1"
  fi
  psql ${PG_OPT} -t -q -d "${DATABASE}" -c "\COPY (SELECT * FROM ${T} WHERE ${LIMIT} AND access_level >= ${LEVEL} ${UNCHECKED_QUERY}) TO '${DUMPDIR}/${T}.csv' WITH (DELIMITER ' ', NULL '\\N', ESCAPE '\\', FORMAT CSV, ENCODING 'UTF-8');"
  ADD=$( psql ${PG_OPT} -t -q -d "${DATABASE}" -c "SELECT count(*) FROM ${T} WHERE ${LIMIT} AND access_level >= ${LEVEL} ${UNCHECKED_QUERY}" | tr -d ' ' )
  if [ "${QUIET}" != "YES" ]; then
    echo "DUMPED ${ADD}"
  fi
done
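
# at this point ${DUMPDIR} contains the schema dump (${VERSION}.schema),
# schema_migrations.csv, users.csv and one CSV file per exported table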

# all done dumping database
tar zcf "${OUTPUT}/bety.tar.gz" -C "${DUMPDIR}" .
rm -rf "${DUMPDIR}"