Skip to content

Commit f8b0f6f

Browse files
feature/active student prototype v0 (#115)
2 parents 2f3a798 + 84ff972 commit f8b0f6f

12 files changed

+418
-22
lines changed

dbt/models/intermediate/_intermediate__models.yml

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,80 @@
11
version: 2
22

33
models:
4+
- name: int_daily_summary_user_level_activity
5+
description: "This model summarizes user-level activity per user per day. Specifically, this summarizes the number of first-time interactions a user had with levels on a given day. It does not count re-visits to levels."
6+
columns:
7+
- name: user_id
8+
description: "The Code.org user id for this user."
9+
tests:
10+
- not_null
11+
- name: activity_date
12+
description: "The date (day) of the user's activity. Extracted and cast from the user_level record `created_at` timestamp. Multiple records per day are aggregated and summarized for each day."
13+
tests:
14+
- not_null
15+
- name: course_list
16+
description: "A concatenated list of unique course names that the user has interacted with on a particular day, ordered alphabetically."
17+
- name: num_user_level_records
18+
description: "The count of user-level records created in the database for the user on the specified activity date. This indicates the number of levels a user interacted with for the first time on this day."
19+
tests:
20+
- dbt_utils.unique_combination_of_columns:
21+
combination_of_columns:
22+
- user_id
23+
- activity_date
24+
25+
26+
- name: int_daily_summary_sign_ins
27+
description: "This model summarizes the number of sign-ins per user per day. It tracks the number of times a user logs in per day."
28+
columns:
29+
- name: user_id
30+
description: "The Code.org user id for this user."
31+
tests:
32+
- not_null
33+
- name: activity_date
34+
description: "The date (day) of the user's sign-in. Extracted and cast from the `sign_in_at` timestamp. The model aggregates multiple sign-ins per day into a single row per user per day."
35+
tests:
36+
- not_null
37+
- name: num_sign_ins
38+
description: "The count of sign-in records for the user on the specified activity date. Represents the number of times the user generated a new session / sign-in record on this day."
39+
40+
tests:
41+
- dbt_utils.unique_combination_of_columns:
42+
combination_of_columns:
43+
- user_id
44+
- activity_date
45+
46+
- name: int_daily_summary_project_activity
47+
description: |
48+
This model aggregates and summarizes the number and types of **new projects** each user created per day.
49+
It lists unique project types engaged per day and counts the total project records created on that day.
50+
51+
_(A future modification of this model could include updates to projects, rather than just new project creations, but we would need to start logging that. As of 5.15.24 that data is not being collected.)_
52+
columns:
53+
- name: cdo_user_id
54+
description: "The Code.org user_id (via their project_storage_id)"
55+
56+
- name: storage_id
57+
description: "The project storage id associated with the creator of the project (use this when cdo_user_id is null for anonymous projects)"
58+
- name: user_id_merged
59+
description: The `cdo_user_id` if a known code.org user id exists, otherwise the value is `'storage_id_' || storage_id`. This 'merged' user id serves as a unique identifier across all projects, for both known code.org users and anonymous users.
60+
61+
- name: activity_date
62+
description: "The date (day) when the project record was created, derived from the `created_at` timestamp."
63+
tests:
64+
- not_null
65+
- name: project_types
66+
description: "A comma-separated list of distinct project types (applab, gamelab, etc.) the user engaged with on a particular day, ordered alphabetically."
67+
- name: num_project_records
68+
description: "The count of projects created associated with the user on the specified activity date."
69+
70+
tests:
71+
- dbt_utils.unique_combination_of_columns:
72+
combination_of_columns:
73+
- user_id_merged
74+
- activity_date
75+
76+
77+
478
- name: int_active_sections
579
description: all sections passing the "active" threshold in a given school year (5+ students completing 1+ levels of same course)
680
tests:
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/*
    Daily summary of newly created projects: the distinct project types and the
    number of project records created per day.

    Note: projects differ from other records of user activity on the platform
    (e.g. sign_ins, user_levels) in that they DO track anonymous usage.

    A project gets a storage_id associated with a user (presumably tied to the
    browser session, but need to verify), and when a code.org user is actually
    signed in there is a mapping of storage_ids to user_ids
    (user_project_storage_ids).

    In this model we create a user_id_merged that is the code.org user_id if it
    exists and the storage_id otherwise, prepending 'storage_id_' to make the
    anonymous case obvious.

    NOTE(review): rows are grouped by storage_id as well as user_id_merged, so
    (user_id_merged, activity_date) is unique only if each code.org user maps to
    a single storage_id -- confirm against user_project_storage_ids.
*/
select
    -- Aliased to avoid someone accidentally joining to user_id without
    -- realizing it can be null for anonymous users. Use user_id_merged for
    -- joins instead.
    upsi.user_id as cdo_user_id,
    p.storage_id,
    p.created_at::date as activity_date,

    -- storage_ids look too much like user_ids and could produce collisions /
    -- false-positive matches against user_id with a straight coalesce, so
    -- prepend 'storage_id_' to avoid accidental joins. Both branches are cast
    -- to varchar explicitly rather than relying on implicit conversion.
    coalesce(
        upsi.user_id::varchar,
        'storage_id_' || p.storage_id::varchar
    ) as user_id_merged,

    case when upsi.user_id is not null then 1 else 0 end as known_cdo_user,

    listagg(distinct p.project_type, ', ') within group (
        order by p.project_type
    ) as project_types,

    count(*) as num_project_records

from {{ ref('stg_dashboard_pii__projects') }} as p
left join {{ ref('stg_dashboard__user_project_storage_ids') }} as upsi
    on p.storage_id = upsi.user_project_storage_id
where
    -- TODO: remove this filter before publish, make incremental?
    p.created_at >= '2022-07-01'
-- groups by the five non-aggregated columns selected above
{{ dbt_utils.group_by(5) }}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
/*
    Number of sign-in records per user per day.
*/
with sign_ins as (
    select
        user_id,
        sign_in_at::date as activity_date
    from {{ ref('stg_dashboard__sign_ins') }}
    where
        -- TODO: remove this filter before publish, make incremental?
        trunc(sign_in_at) >= '2022-01-01'
        and trunc(sign_in_at) <= sysdate
)

select
    user_id,
    activity_date,
    count(*) as num_sign_ins
from sign_ins
group by
    user_id,
    activity_date
8+
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
/*
    This model summarizes user_level activity per user per day: the distinct
    courses touched and the number of user_level records created that day.

    NOTE(review): count(*) is taken after the left join to dim_course_structure,
    so it equals the number of user_level records only if each
    (level_id, script_id) pair matches at most one course-structure row --
    confirm.
*/
select
    ul.user_id,
    ul.created_at::date as activity_date,
    -- ul.updated_at::date would be better if we could log updated_at for daily activity

    -- Explicit ', ' delimiter: without one, listagg concatenates the course
    -- names with no separator at all.
    listagg(distinct cs.course_name_true, ', ') within group (
        order by cs.course_name_true
    ) as course_list,

    count(*) as num_user_level_records

from {{ ref('stg_dashboard__user_levels') }} as ul
left join {{ ref('dim_course_structure') }} as cs
    on cs.level_id = ul.level_id
    and cs.script_id = ul.script_id
where
    -- TODO: remove this filter before publish, make incremental?
    -- Inclusive lower bound, matching the sign-ins daily summary.
    trunc(ul.created_at) >= '2022-01-01'
    -- Don't pull records for which we don't have user_ids (this will be
    -- records and accounts created within the last ~24 hours or so).
    and ul.created_at <= (select max(created_at) from {{ ref('dim_users') }})
group by
    ul.user_id,
    ul.created_at::date

dbt/models/marts/metrics/_metrics__models.yml

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,47 @@
11
version: 2
22

33
models:
4+
- name: fct_active_students_daily
5+
description: "number of active students per day, segmented by country and user type (`user_type_merged`)"
6+
columns:
7+
- name: date
8+
description: "the date (day) of activity"
9+
tests:
10+
- not_null
11+
- name: country
12+
description: "the country of the users as determined by code.org geolocation"
13+
- name: us_intl
14+
description: "'us' if the `country` = 'United States', 'intl' if not (there are a handful of Nulls)"
15+
- name: num_actives
16+
description: the number of active students on given date, in given country
17+
tests:
18+
- dbt_utils.unique_combination_of_columns:
19+
combination_of_columns:
20+
- date
21+
- country
22+
- user_type_merged
23+
24+
- name: fct_active_students_monthly
25+
description: "number of active students per month (Jan, Feb, Mar, etc.), segmented by country and user type (`user_type_merged`)"
26+
27+
columns:
28+
- name: month_year
29+
description: "The month and year of activity, truncated/aggregated to first of the month. e.g. all data for January, 2022 is given as month_year = '2022-01-01' "
30+
tests:
31+
- not_null
32+
- name: country
33+
description: "the country of the users as determined by code.org geolocation"
34+
- name: us_intl
35+
description: "'us' if the `country` = 'United States', 'intl' if not (there are a handful of Nulls)"
36+
- name: num_actives
37+
description: the number of active students on given month, in given country
38+
tests:
39+
- dbt_utils.unique_combination_of_columns:
40+
combination_of_columns:
41+
- month_year
42+
- country
43+
- user_type_merged
444

5-
# Weekly School Acquisition Metrics
645
- name: fct_weekly_school_acquisition_metrics
746
description:
847
Weekly counts of school acquisitions by status (new, retained, reacquired, inactive, churn, market).
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
/*
    Number of active students per day, by user type and country.
*/
with actives as (
    select
        activity_date,
        user_type_merged,
        country,
        us_intl,
        user_id
    from {{ ref('dim_active_students') }}
)

select
    activity_date as "date",
    user_type_merged,
    country,
    us_intl,
    count(distinct user_id) as num_actives
from actives
group by
    activity_date,
    user_type_merged,
    country,
    us_intl
order by
    activity_date,
    user_type_merged
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
/*
    Number of active students per month, by user type and country.
    Each activity date is bucketed to the first day of its month.
*/
with actives as (
    select
        date_trunc('month', activity_date)::date as month_year,
        user_type_merged,
        country,
        us_intl,
        user_id
    from {{ ref('dim_active_students') }}
)

select
    month_year,
    user_type_merged,
    country,
    us_intl,
    count(distinct user_id) as num_actives
from actives
group by
    month_year,
    user_type_merged,
    country,
    us_intl
order by
    month_year,
    user_type_merged

dbt/models/marts/students/_students__models.yml

Lines changed: 106 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,124 @@
11
version: 2
22

3-
models:
3+
models:
4+
- name: dim_active_students
5+
description: |
6+
This model serves as a **prototype** for the active student metric. (Please update this description as the metric is improved/modified.)
7+
8+
The model aggregates student activities _per user, per day_ across**:
9+
- `user_levels`
10+
- `sign-ins`
11+
- `projects`
12+
13+
** the timestamps used for `user_level` and `projects` are the created_at timestamps right now. A better approach (in theory) would be to
14+
use updated_at, but we would need to start logging those daily.
15+
16+
columns:
17+
- name: activity_date
18+
description: "The date on which activity was recorded for the user, merged across all aggregated activity types."
19+
tests:
20+
- not_null
21+
data_type: "date"
22+
23+
- name: user_id
24+
description: "The Code.org user id for a user, consolidated from any activity across user levels, sign-ins, and projects data."
25+
tests:
26+
- not_null
27+
data_type: "varchar"
28+
29+
- name: user_type_merged
30+
description: "The type of user (e.g., student, teacher), sourced from the Code.org user profile."
31+
data_type: "varchar"
32+
33+
- name: country
34+
description: "The country of the user, derived from Code.org user-geographical data linked to the user_id. The country is volatile - it maps to the user's _current_ geographic location, not necessarily their location at the time of the event, and it can change over time."
35+
data_type: "varchar"
36+
37+
- name: us_intl
38+
description: "Classifies the `country` as U.S. or international (`us` or `intl`). International really means non-U.S."
39+
data_type: "varchar"
40+
41+
- name: school_year
42+
description: "The school year during which the activity occurred, determined by matching the activity date with school year ranges. A school year is defined as the 365 days between July 1 in year 1, and June 30 in year 2."
43+
data_type: "varchar"
44+
45+
- name: calendar_year
46+
description: "The calendar year extracted from the activity date, used for additional temporal analysis."
47+
data_type: "integer"
48+
49+
- name: num_user_level_records
50+
description: "The number of new user level records created for the user on the given day."
51+
data_type: "integer"
52+
53+
- name: num_project_records
54+
description: "The number of new projects created (project records) associated with the user for the given day."
55+
data_type: "integer"
56+
57+
- name: num_sign_ins
58+
description: "The number of sign-in records for the user on the given day."
59+
data_type: "integer"
60+
61+
- name: has_sign_in_activity
62+
description: "`0|1` flag indicating whether there was any sign-in activity for the user on the given day (1 for yes, 0 for no)."
63+
data_type: "integer"
64+
65+
- name: has_user_level_activity
66+
description: "`0|1` flag indicating whether there was any user level activity for the user on the given day (1 for yes, 0 for no)."
67+
data_type: "integer"
68+
69+
- name: has_project_activity
70+
description: "`0|1` flag indicating whether there was any project activity for the user on the given day (1 for yes, 0 for no)."
71+
data_type: "integer"
72+
73+
- name: activity_type
74+
description: "A 3-character long string 'encoded' to represent the three possible types of activity present for the user on the given day: 'S' for sign-ins, 'L' for user levels, 'P' for projects, in that order; '_' represents absence of the respective activity. For example: `SL_` means that on this day the user has user_level activity and a sign-in record, but no project created. `__P` means only a project was created on this day for this user. This field reflects in a single string the 0|1 values from the `has_[event]_activity` fields."
75+
data_type: "varchar"
76+
77+
tests:
78+
- dbt_utils.unique_combination_of_columns:
79+
combination_of_columns:
80+
- user_id
81+
- activity_date
482

583
- name: dim_students
684
description: This model contains all student accounts ever created + geographic info on the student
7-
columns:
8-
- name: user_id
9-
description: unique ID for individual student user
10-
tests:
11-
- not_null
12-
- unique
85+
86+
columns:
87+
- name: user_id
88+
description: unique ID for individual student user
89+
tests:
90+
- not_null
91+
- unique
92+
1393

1494
# - name: user_type
1595
# description: always "student"- used for verification
1696
# tests:
1797
# - not_null
1898

19-
- name: created_at_school_year
20-
description: the school year in which the student account was created
21-
22-
- name: gender
23-
description: the classified gender of a student based on their free-response input
24-
25-
- name: is_urg
26-
description: flag to determine whether a student selected at least one of the following races- black, hispanic, hawaiian/ pacific islander, native american
27-
28-
- name: is_international
29-
description: 1 if last activity was outside the US, 0 otherwise
99+
- name: created_at_school_year
100+
description: the school year in which the student account was created
101+
102+
- name: gender
103+
description: the classified gender of a student based on their free-response input
104+
105+
- name: is_urg
106+
description: flag to determine whether a student selected at least one of the following races- black, hispanic, hawaiian/ pacific islander, native american
107+
108+
- name: is_international
109+
description: 1 if last activity was outside the US, 0 otherwise
110+
111+
- name: school_id
112+
description: school association of student
30113

31-
- name: school_id
32-
description: school association of student
114+
# - name: user_type
115+
# description: always "student"- used for verification
116+
# tests:
117+
# - not_null
33118

34119
# - name: dim_student_status
35120
# description: This model categorizes students based on their activity status across different school years. It provides insights into user engagement by assigning a status that reflects their activity in the current, previous, and any earlier school years.
36-
# columns:
121+
# columns:
37122
# - name: student_id
38123
# description: the unique ID associated with the activity
39124
# - name: school_year

0 commit comments

Comments
 (0)