# Schema identification. Version and date are quoted so that YAML parsers keep
# them as strings instead of inferring a number or a date object.
schema_version: "1.0.0"
schema_date: "2025-06-22"
description: >-
  Database metadata schema for recodeflow - defines Dublin Core compliant
  dataset-level metadata for databases and data collections.
registry_file: "metadata_registry.yaml"

# Note: YAML format specifications are defined in metadata_registry.yaml
# to avoid duplicating them here (DRY).

database_metadata_schema:
  title: "Database metadata configuration"
  description: >-
    Defines dataset-level metadata following Dublin Core standards for
    database documentation and cataloging.

  # Metadata standard the fields map to, and the file format they describe.
  standard: "Dublin Core with DCAT extensions"
  target_format: "YAML metadata files"

  # Field definitions following Dublin Core standard
  fields:
    # ==========================================================================
    # CORE DUBLIN CORE FIELDS - Essential dataset documentation
    # ==========================================================================

21+ - name : " title"
22+ title : " Dataset title"
23+ description : " Name of the dataset."
24+ type : " string"
25+ tier : " core"
26+ dublin_core_element : " dc:title"
27+ constraints :
28+ required : true
29+ notes : |
30+ Provide a clear, concise name for the dataset.
31+ Examples: "Health Survey 2024", "Primary Biliary Cirrhosis (PBC) Data Set"
32+
33+ - name : " description"
34+ title : " Dataset description"
35+ description : " Detailed explanation of the dataset."
36+ type : " string"
37+ tier : " core"
38+ dublin_core_element : " dc:description"
39+ constraints :
40+ required : true
41+ notes : |
42+ Comprehensive description of the dataset including purpose, scope, and methodology.
43+ Should be sufficient for users to understand if the dataset meets their needs.
44+
45+ - name : " creator"
46+ title : " Dataset creator"
47+ description : " Person or organization responsible for creating the data."
48+ type : " array"
49+ tier : " core"
50+ dublin_core_element : " dc:creator"
51+ constraints :
52+ required : true
53+ item_structure :
54+ name : " Creator name"
55+ affiliation : " Creator affiliation (optional)"
56+ orcid : " ORCID identifier (optional)"
57+ notes : |
58+ Attribution of data origin and responsibility.
59+ Examples: "Mayo Clinic", "RecodeFlow Team", "Statistics Canada"
60+
61+ - name : " publisher"
62+ title : " Dataset publisher"
63+ description : " Organization publishing the data."
64+ type : " string"
65+ tier : " core"
66+ dublin_core_element : " dc:publisher"
67+ constraints :
68+ required : true
69+ notes : |
70+ Identifies the official data publisher or distributing organization.
71+ Examples: "Public Health Agency", "Mayo Clinic", "CRAN"
72+
73+ - name : " subject"
74+ title : " Dataset subject"
75+ description : " Topics covered by the dataset."
76+ type : " array"
77+ tier : " core"
78+ dublin_core_element : " dc:subject"
79+ constraints :
80+ required : true
81+ notes : |
82+ Categorize dataset's thematic content using relevant keywords or controlled vocabularies.
83+ Examples: ["primary biliary cirrhosis", "clinical study", "medical research"]
84+
85+ - name : " date_created"
86+ title : " Creation date"
87+ description : " Dataset creation date."
88+ type : " string"
89+ tier : " core"
90+ dublin_core_element : " dc:date"
91+ format : " date"
92+ constraints :
93+ required : true
94+ pattern : " ^[0-9]{4}-[0-9]{2}-[0-9]{2}$"
95+ notes : |
96+ Use ISO date format: YYYY-MM-DD
97+ Track dataset's initial creation date.
98+
99+ - name : " date_modified"
100+ title : " Last modification date"
101+ description : " Date when dataset was last modified."
102+ type : " string"
103+ tier : " optional"
104+ dublin_core_element : " dcterms:modified"
105+ format : " date"
106+ constraints :
107+ pattern : " ^[0-9]{4}-[0-9]{2}-[0-9]{2}$"
108+ notes : |
109+ Use ISO date format: YYYY-MM-DD
110+ Track most recent updates to the dataset.
111+
112+ - name : " version"
113+ title : " Dataset version"
114+ description : " Version number of the dataset."
115+ type : " string"
116+ tier : " optional"
117+ dublin_core_element : " dcterms:hasVersion"
118+ constraints :
119+ pattern : " ^[0-9]+\\ .[0-9]+\\ .[0-9]+$"
120+ notes : |
121+ Track dataset iterations using semantic versioning (e.g., 1.0.0).
122+ Increment for changes: major.minor.patch
123+
124+ - name : " license"
125+ title : " Licensing information"
126+ description : " Licensing and usage rights information."
127+ type : " string"
128+ tier : " core"
129+ dublin_core_element : " dc:rights"
130+ constraints :
131+ required : true
132+ notes : |
133+ Specify usage and distribution rights clearly.
134+ Examples: "CC-BY 4.0", "Open Source", "Restricted - Contact Publisher"
135+
136+ - name : " contact_point"
137+ title : " Dataset contact"
138+ description : " Contact information for dataset inquiries."
139+ type : " string"
140+ tier : " core"
141+ dublin_core_element : " dcat:contactPoint"
142+ constraints :
143+ required : true
144+ notes : |
145+ Provide communication channel for questions about the dataset.
146+ Examples: "support@example.org", "researcher@institution.edu"
147+
    # ==========================================================================
    # EXTENDED DUBLIN CORE / DCAT FIELDS - Enhanced metadata
    # ==========================================================================

152+ - name : " type"
153+ title : " Dataset type"
154+ description : " Type or nature of the dataset."
155+ type : " string"
156+ tier : " optional"
157+ dublin_core_element : " dc:type"
158+ constraints :
159+ enum : ["Dataset", "Survey", "Clinical Trial", "Administrative Data", "Registry"]
160+ notes : |
161+ Classify the nature of the data collection.
162+
163+ - name : " format"
164+ title : " Dataset format"
165+ description : " Physical or digital manifestation of the dataset."
166+ type : " string"
167+ tier : " optional"
168+ dublin_core_element : " dc:format"
169+ notes : |
170+ Describe the format and structure of the data.
171+ Examples: "Tabular data", "CSV files", "R data frames"
172+
173+ - name : " identifier"
174+ title : " Dataset identifier"
175+ description : " Unique identifier for the dataset."
176+ type : " array"
177+ tier : " optional"
178+ dublin_core_element : " dc:identifier"
179+ item_structure :
180+ type : " Identifier type"
181+ value : " Identifier value"
182+ notes : |
183+ Provide unique identifiers for referencing the dataset.
184+ Examples: DOI, package name, institutional ID
185+
186+ - name : " source"
187+ title : " Dataset source"
188+ description : " Source or origin of the dataset."
189+ type : " string"
190+ tier : " optional"
191+ dublin_core_element : " dc:source"
192+ notes : |
193+ Reference to the original source or related datasets.
194+ Examples: URLs, publications, parent datasets
195+
196+ - name : " language"
197+ title : " Dataset language"
198+ description : " Language(s) used in the dataset."
199+ type : " string"
200+ tier : " optional"
201+ dublin_core_element : " dc:language"
202+ constraints :
203+ pattern : " ^[a-z]{2}(-[A-Z]{2})?$"
204+ notes : |
205+ Use ISO 639-1 language codes (e.g., "en", "fr", "en-CA").
206+
207+ - name : " relation"
208+ title : " Related resources"
209+ description : " Relationships to other datasets or resources."
210+ type : " array"
211+ tier : " optional"
212+ dublin_core_element : " dc:relation"
213+ item_structure :
214+ type : " Relationship type"
215+ identifier : " Related resource identifier"
216+ description : " Description of relationship"
217+ notes : |
218+ Document connections to related datasets, publications, or projects.
219+
220+ - name : " coverage"
221+ title : " Dataset coverage"
222+ description : " Spatial or temporal coverage of the dataset."
223+ type : " object"
224+ tier : " optional"
225+ dublin_core_element : " dc:coverage"
226+ structure :
227+ spatial : " Geographic coverage"
228+ temporal : " Time period coverage"
229+ notes : |
230+ Specify the scope of data collection in space and time.
231+
    # ==========================================================================
    # RECODEFLOW-SPECIFIC EXTENSIONS - Integration metadata
    # ==========================================================================

236+ - name : " recodeflow_integration"
237+ title : " Recodeflow integration metadata"
238+ description : " Metadata specific to recodeflow usage and integration."
239+ type : " object"
240+ tier : " extension"
241+ structure :
242+ variables_file : " Associated variables.csv file"
243+ variable_details_file : " Associated variable_details.csv file"
244+ harmonization_notes : " Notes about harmonization approach"
245+ rec_with_table_compatible : " Boolean indicating compatibility"
246+ notes : |
247+ Integration metadata for recodeflow workflow compatibility.
248+ Links database metadata to associated variable definition files.
249+
250+ # Usage patterns
251+ usage_patterns :
252+ metadata_files :
253+ description : " YAML files alongside data files for metadata documentation."
254+ naming_convention : " {dataset_name}_metadata.yaml"
255+ examples : ["pbc_metadata.yaml", "cchs2017_metadata.yaml"]
256+
257+ # Validation and quality
258+ validation_notes : |
259+ - All required Dublin Core fields must be present
260+ - Date fields must follow ISO 8601 format (YYYY-MM-DD)
261+ - Language codes must follow ISO 639-1 standard
262+ - Contact points should be valid email addresses or URLs
263+ - Version numbers should follow semantic versioning when provided
264+
# Note: Missing data handling, validation modes, and shared specifications
# are defined in metadata_registry.yaml