Skip to content

Commit b1095b7

Browse files
authored
feat(firestore-bigquery-export): import script updates (#2264)
* fix(bq-import-script): fix path_param extraction
* fix(bq-import-script): enforce batching in multi-thread approach
* feat(bq-import-script): add failed output json option
* fix(bq-import-script): fix cross-project issue
* fix(bq-import-script): delete empty failed json docs
* fix(bq-import-script): write failed outputs to txt format instead
* docs(bq-import-script): bump version and update guide
* refactor(bq-import-script): clean up and bump version
1 parent 617caa7 commit b1095b7

File tree

14 files changed

+7678
-1923
lines changed

14 files changed

+7678
-1923
lines changed

firestore-bigquery-export/guides/IMPORT_EXISTING_DOCUMENTS.md

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,3 +89,53 @@ Run the import script using [`npx` (the Node Package Runner)](https://www.npmjs.
8989
```
9090

9191
The result set will contain the number of documents in your source collection.
92+
93+
### Handling Failed Batches (`-f, --failed-batch-output`)
94+
95+
#### Overview
96+
97+
If any document batches fail to import, use the `-f` (or `--failed-batch-output`) option to specify a file in which the paths of the failed documents will be recorded, so that you can review and retry those imports later.
98+
99+
---
100+
101+
#### Usage
102+
103+
```sh
104+
npx @firebaseextensions/fs-bq-import-collection -f failed_batches.txt
105+
```
106+
107+
In the example above, any documents that fail to import will have their paths written to `failed_batches.txt`.
108+
109+
---
110+
111+
#### Example Output
112+
113+
If some documents fail, the output file will contain paths like:
114+
115+
```
116+
projects/my-project/databases/(default)/documents/users/user123
117+
projects/my-project/databases/(default)/documents/orders/order456
118+
projects/my-project/databases/(default)/documents/posts/post789
119+
```
120+
121+
Each line corresponds to a document that failed to import.
122+
123+
---
124+
125+
#### Console Logging of Failed Batches
126+
127+
The import script will also log failed imports to the console. You may see output like this:
128+
129+
```
130+
Failed batch: <paths of failed documents in batch>
131+
```
132+
133+
This helps you quickly identify problematic documents and take action accordingly.
134+
135+
---
136+
137+
#### Retrying Failed Imports
138+
139+
To retry failed imports, use the output file to inspect or reprocess the affected documents. For example, you could write a script that reads each failed document path from the file and re-attempts the import for that document.
140+
141+
> **Note:** If the specified file already exists, its contents will be **cleared** before any new failed document paths are written, so save a copy first if you need to keep the results of a previous run.
Lines changed: 331 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,331 @@
1+
import { ChangeType } from "@firebaseextensions/firestore-bigquery-change-tracker";
2+
import { getRowsFromDocs } from "../src/helper";
3+
4+
describe("getRowsFromDocs", () => {
5+
  // Baseline case: one mock document under a plain (wildcard-free) collection
  // path should produce exactly one row carrying the IMPORT operation, the
  // fully-qualified document name, empty pathParams/eventId and the doc's data.
  it("transforms basic Firestore documents into correct row format", () => {
6+
    // Minimal stand-in for a Firestore document snapshot: only `id`,
    // `ref.path` and `data()` are provided, hence the `as any[]` cast.
    const mockDocs = [
7+
      {
8+
        id: "doc1",
9+
        ref: {
10+
          path: "users/doc1",
11+
        },
12+
        data: () => ({
13+
          name: "John Doe",
14+
          age: 30,
15+
        }),
16+
      },
17+
    ] as any[];
18+
19+
    // Only the config fields this test relies on are supplied.
    const mockConfig = {
20+
      projectId: "test-project",
21+
      sourceCollectionPath: "users",
22+
      queryCollectionGroup: false,
23+
    } as any;
24+
25+
    // Capture ISO timestamps immediately before and after the call so the
    // row's timestamp can be bounded without mocking the clock.
    const beforeTimestamp = new Date().toISOString();
26+
    const result = getRowsFromDocs(mockDocs, mockConfig);
27+
    const afterTimestamp = new Date().toISOString();
28+
29+
    expect(result).toHaveLength(1);
30+
    // toMatchObject checks only the listed fields; `timestamp` is verified
    // separately below.
    expect(result[0]).toMatchObject({
31+
      operation: ChangeType.IMPORT,
32+
      documentName:
33+
        "projects/test-project/databases/(default)/documents/users/doc1",
34+
      documentId: "doc1",
35+
      pathParams: {},
36+
      eventId: "",
37+
      data: {
38+
        name: "John Doe",
39+
        age: 30,
40+
      },
41+
    });
42+
43+
    // toISOString() output is fixed-width UTC, so plain string comparison
    // orders the bounds chronologically.
    // NOTE(review): assumes the row's `timestamp` uses the same ISO format — confirm against getRowsFromDocs.
    expect(result[0].timestamp >= beforeTimestamp).toBeTruthy();
44+
    expect(result[0].timestamp <= afterTimestamp).toBeTruthy();
45+
  });
46+
47+
  // A source path with two wildcards ({regionId}, {countryId}) should yield
  // pathParams populated from the matching segments of the document path.
  it("correctly extracts wildcard parameters from document paths", () => {
48+
    // Single mock document nested two levels below wildcard segments.
    const mockDocs = [
49+
      {
50+
        id: "paris",
51+
        ref: {
52+
          path: "regions/europe/countries/france/cities/paris",
53+
        },
54+
        data: () => ({
55+
          population: 2161000,
56+
          isCapital: true,
57+
        }),
58+
      },
59+
    ] as any[];
60+
61+
    const mockConfig = {
62+
      projectId: "test-project",
63+
      sourceCollectionPath: "regions/{regionId}/countries/{countryId}/cities",
64+
      queryCollectionGroup: false,
65+
    } as any;
66+
67+
    const result = getRowsFromDocs(mockDocs, mockConfig);
68+
69+
    expect(result).toHaveLength(1);
70+
    // Each wildcard name maps to the concrete segment at the same position.
    expect(result[0].pathParams).toEqual({
71+
      regionId: "europe",
72+
      countryId: "france",
73+
    });
74+
    expect(result[0].documentName).toBe(
75+
      "projects/test-project/databases/(default)/documents/regions/europe/countries/france/cities/paris"
76+
    );
77+
    expect(result[0].documentId).toBe("paris");
78+
    expect(result[0].data).toEqual({
79+
      population: 2161000,
80+
      isCapital: true,
81+
    });
82+
  });
83+
84+
  // Variant of the wildcard test with a single wildcard ({testId}) in the
  // middle of a short source path.
  it("correctly extracts wildcard parameters from document paths 2", () => {
85+
    // NOTE(review): `id` is "paris" while the path ends in "doc1"; this looks
    // like a copy-paste leftover from the previous test. The assertions below
    // expect documentId to follow `id` and documentName to follow `ref.path`.
    const mockDocs = [
86+
      {
87+
        id: "paris",
88+
        ref: {
89+
          path: "my/cool/collection/doc1",
90+
        },
91+
        data: () => ({
92+
          population: 2161000,
93+
          isCapital: true,
94+
        }),
95+
      },
96+
    ] as any[];
97+
98+
    const mockConfig = {
99+
      projectId: "test-project",
100+
      sourceCollectionPath: "my/{testId}/collection",
101+
      queryCollectionGroup: false,
102+
    } as any;
103+
104+
    const result = getRowsFromDocs(mockDocs, mockConfig);
105+
106+
    expect(result).toHaveLength(1);
107+
    // The wildcard segment "cool" is expected under the key "testId".
    expect(result[0].pathParams).toEqual({
108+
      testId: "cool",
109+
    });
110+
    expect(result[0].documentName).toBe(
111+
      "projects/test-project/databases/(default)/documents/my/cool/collection/doc1"
112+
    );
113+
    expect(result[0].documentId).toBe("paris");
114+
    expect(result[0].data).toEqual({
115+
      population: 2161000,
116+
      isCapital: true,
117+
    });
118+
  });
119+
120+
  // With queryCollectionGroup enabled, documents whose path does not fit the
  // templated source path are expected to be dropped from the result.
  it("handles collection group queries correctly", () => {
121+
    // These documents have the same collection name 'users' but at different paths
122+
    const mockDocs = [
123+
      {
124+
        id: "user1",
125+
        ref: {
126+
          path: "organizations/org1/users/user1",
127+
        },
128+
        data: () => ({
129+
          name: "John",
130+
        }),
131+
      },
132+
      {
133+
        id: "user2",
134+
        ref: {
135+
          path: "organizations/org2/users/user2",
136+
        },
137+
        data: () => ({
138+
          name: "Jane",
139+
        }),
140+
      },
141+
      {
142+
        id: "user3",
143+
        ref: {
144+
          path: "teams/team1/users/user3", // Different parent path
145+
        },
146+
        data: () => ({
147+
          name: "Bob",
148+
        }),
149+
      },
150+
    ] as any[];
151+
152+
    const mockConfig = {
153+
      projectId: "test-project",
154+
      sourceCollectionPath: "organizations/{orgId}/users", // Template path
155+
      queryCollectionGroup: true,
156+
    } as any;
157+
158+
    const result = getRowsFromDocs(mockDocs, mockConfig);
159+
160+
    // Should only include documents that match the template path pattern
161+
    expect(result).toHaveLength(2);
162+
163+
    // First document should match and have correct path params
164+
    expect(result[0].pathParams).toEqual({
165+
      orgId: "org1",
166+
    });
167+
    expect(result[0].documentName).toBe(
168+
      "projects/test-project/databases/(default)/documents/organizations/org1/users/user1"
169+
    );
170+
171+
    // Second document should match and have correct path params
172+
    expect(result[1].pathParams).toEqual({
173+
      orgId: "org2",
174+
    });
175+
    expect(result[1].documentName).toBe(
176+
      "projects/test-project/databases/(default)/documents/organizations/org2/users/user2"
177+
    );
178+
179+
    // The third document (teams/team1/users/user3) should have been filtered out
180+
    // as it doesn't match the template path pattern
    // (its absence is verified only through the length check above).
181+
  });
182+
183+
  // Runs the same three documents through two collection-group configs whose
  // first segments share a prefix ("my" vs "my_other"); each config is
  // expected to select only the document under its own first segment.
  it("handles collection group queries with underscore paths correctly", () => {
184+
    // Test collection group queries with both regular and underscore paths
185+
    const mockDocs = [
186+
      {
187+
        id: "doc1",
188+
        ref: {
189+
          path: "my/test1/collection/doc1",
190+
        },
191+
        data: () => ({
192+
          value: 1,
193+
        }),
194+
      },
195+
      {
196+
        id: "doc2",
197+
        ref: {
198+
          path: "my_other/test2/collection/doc2",
199+
        },
200+
        data: () => ({
201+
          value: 2,
202+
        }),
203+
      },
204+
      {
205+
        id: "doc3",
206+
        ref: {
207+
          path: "different/test3/collection/doc3",
208+
        },
209+
        data: () => ({
210+
          value: 3,
211+
        }),
212+
      },
213+
    ] as any[];
214+
215+
    // Test with my/{coolId}/collection
216+
    const myConfig = {
217+
      projectId: "test-project",
218+
      sourceCollectionPath: "my/{coolId}/collection",
219+
      queryCollectionGroup: true,
220+
    } as any;
221+
222+
    const myResult = getRowsFromDocs(mockDocs, myConfig);
223+
    // Only doc1 is expected; "my_other/..." must not be treated as "my/...".
    expect(myResult).toHaveLength(1);
224+
    expect(myResult[0].pathParams).toEqual({
225+
      coolId: "test1",
226+
    });
227+
    expect(myResult[0].documentName).toBe(
228+
      "projects/test-project/databases/(default)/documents/my/test1/collection/doc1"
229+
    );
230+
231+
    // Test with my_other/{coolId}/collection
232+
    const myOtherConfig = {
233+
      projectId: "test-project",
234+
      sourceCollectionPath: "my_other/{coolId}/collection",
235+
      queryCollectionGroup: true,
236+
    } as any;
237+
238+
    const myOtherResult = getRowsFromDocs(mockDocs, myOtherConfig);
239+
    // Only doc2 is expected for the underscore-prefixed template.
    expect(myOtherResult).toHaveLength(1);
240+
    expect(myOtherResult[0].pathParams).toEqual({
241+
      coolId: "test2",
242+
    });
243+
    expect(myOtherResult[0].documentName).toBe(
244+
      "projects/test-project/databases/(default)/documents/my_other/test2/collection/doc2"
245+
    );
246+
  });
247+
248+
  // Same prefix-collision scenario as the underscore test, scaled up to
  // 1500 generated documents (500 per path family) and checked against both
  // the "my_other" and "my" templates.
  it("handles collection group queries with large batches correctly", () => {
249+
    // Create a large batch of mixed documents to simulate real conditions
250+
    const mockDocs = [
251+
      // First batch - should match my_other/{coolId}/collection
252+
      ...Array(500)
253+
        .fill(null)
254+
        .map((_, i) => ({
255+
          id: `doc${i}`,
256+
          ref: {
257+
            path: `my_other/test${i}/collection/doc${i}`,
258+
          },
259+
          data: () => ({
260+
            value: i,
261+
          }),
262+
        })),
263+
      // Second batch - should not match
264+
      ...Array(500)
265+
        .fill(null)
266+
        .map((_, i) => ({
267+
          id: `other${i}`,
268+
          ref: {
269+
            path: `different/test${i}/collection/other${i}`,
270+
          },
271+
          data: () => ({
272+
            value: i,
273+
          }),
274+
        })),
275+
      // Third batch - should match my/{coolId}/collection
276+
      ...Array(500)
277+
        .fill(null)
278+
        .map((_, i) => ({
279+
          id: `another${i}`,
280+
          ref: {
281+
            path: `my/test${i}/collection/another${i}`,
282+
          },
283+
          data: () => ({
284+
            value: i,
285+
          }),
286+
        })),
287+
    ] as any[];
288+
289+
    // Test my_other/{coolId}/collection
290+
    const myOtherConfig = {
291+
      projectId: "test-project",
292+
      sourceCollectionPath: "my_other/{coolId}/collection",
293+
      queryCollectionGroup: true,
294+
    } as any;
295+
296+
    const myOtherResult = getRowsFromDocs(mockDocs, myOtherConfig);
297+
    // Exactly the 500 "my_other/..." documents are expected, in input order.
    expect(myOtherResult).toHaveLength(500);
298+
    expect(myOtherResult[0].documentName).toContain(
299+
      "my_other/test0/collection"
300+
    );
301+
    expect(myOtherResult[0].pathParams).toEqual({ coolId: "test0" });
302+
303+
    // Test my/{coolId}/collection with same batch
304+
    const myConfig = {
305+
      projectId: "test-project",
306+
      sourceCollectionPath: "my/{coolId}/collection",
307+
      queryCollectionGroup: true,
308+
    } as any;
309+
310+
    const myResult = getRowsFromDocs(mockDocs, myConfig);
311+
    // Exactly the 500 "my/..." documents are expected, in input order.
    expect(myResult).toHaveLength(500);
312+
    expect(myResult[0].documentName).toContain("my/test0/collection");
313+
    expect(myResult[0].pathParams).toEqual({ coolId: "test0" });
314+
315+
    // Additional assertions to verify no overlap
316+
    const myOtherPaths = new Set(myOtherResult.map((r) => r.documentName));
317+
    const myPaths = new Set(myResult.map((r) => r.documentName));
318+
    const intersection = [...myOtherPaths].filter((x) => myPaths.has(x));
319+
    expect(intersection).toHaveLength(0);
320+
321+
    // Verify every my_other row's documentName contains the "my_other/"
    // segment (a substring check, not a strict prefix check)
322+
    myOtherResult.forEach((row) => {
323+
      expect(row.documentName).toContain("my_other/");
324+
    });
325+
326+
    // Verify every my row's documentName contains the "/my/" segment; the
    // surrounding slashes keep "my_other/" from satisfying this check
327+
    myResult.forEach((row) => {
328+
      expect(row.documentName).toContain("/my/");
329+
    });
330+
  });
331+
});

0 commit comments

Comments
 (0)