
Commit a33aba5

feat: json output on --json flag
1 parent 14116aa commit a33aba5

File tree

README.md
src/lib.rs
src/main.rs
src/path.rs
src/token.rs

5 files changed: +90 -7 lines changed

README.md

Lines changed: 17 additions & 0 deletions

@@ -125,6 +125,23 @@ Save the generated prompt to an output file:
 code2prompt path/to/codebase --output=output.txt
 ```
 
+Print output as JSON:
+
+```sh
+code2prompt path/to/codebase --json
+```
+
+The JSON output will have the following structure:
+
+```json
+{
+  "directory_name": "codebase",
+  "token_count": 1234,
+  "model_info": "ChatGPT models, text-embedding-ada-002",
+  "files": [...]
+}
+```
+
 Generate a Git commit message (for staged files):
 
 ```sh
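
For readers consuming the new `--json` mode from another program, here is a rough, hedged sketch of what a matching serde definition could look like; the struct name and field types are assumptions inferred only from the README example above (and from the `Vec<String>` of paths built in src/main.rs below), assuming the `serde` (with `derive`) and `serde_json` crates:

```rust
use serde::Deserialize;

// Hypothetical mirror of the documented --json output; field names follow the
// README example above, field types are assumed.
#[derive(Debug, Deserialize)]
struct Code2PromptOutput {
    directory_name: String,
    token_count: usize,
    model_info: String,
    files: Vec<String>,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // In practice this string would come from running `code2prompt <path> --json`.
    let raw = r#"{
        "directory_name": "codebase",
        "token_count": 1234,
        "model_info": "ChatGPT models, text-embedding-ada-002",
        "files": ["src/main.rs", "src/lib.rs"]
    }"#;

    let parsed: Code2PromptOutput = serde_json::from_str(raw)?;
    println!("{} files, {} tokens", parsed.files.len(), parsed.token_count);
    Ok(())
}
```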

src/lib.rs

Lines changed: 1 addition & 1 deletion

@@ -10,4 +10,4 @@ pub use path::{label, traverse_directory};
 pub use template::{
     copy_to_clipboard, handle_undefined_variables, handlebars_setup, render_template, write_to_file,
 };
-pub use token::count_tokens;
+pub use token::{count_tokens, get_model_info, get_tokenizer};

src/main.rs

Lines changed: 26 additions & 4 deletions

@@ -6,7 +6,7 @@
 use anyhow::{Context, Result};
 use clap::Parser;
 use code2prompt::{
-    copy_to_clipboard, count_tokens, get_git_diff, handle_undefined_variables, handlebars_setup,
+    copy_to_clipboard, get_model_info, get_tokenizer, get_git_diff, handle_undefined_variables, handlebars_setup,
     label, render_template, traverse_directory, write_to_file,
 };
 use colored::*;
@@ -76,6 +76,10 @@ struct Cli {
     /// Optional Path to a custom Handlebars template
     #[clap(short, long)]
     template: Option<PathBuf>,
+
+    /// Print output as JSON
+    #[clap(long)]
+    json: bool,
 }
 
 fn main() -> Result<()> {
@@ -128,7 +132,7 @@ fn main() -> Result<()> {
     };
 
     spinner.finish_with_message("Done!".green().to_string());
-
+
     // Prepare JSON Data
     let mut data = json!({
         "absolute_code_path": label(&args.path),
@@ -149,8 +153,26 @@ fn main() -> Result<()> {
     let rendered = render_template(&handlebars, template_name, &data)?;
 
     // Display Token Count
-    if args.tokens {
-        count_tokens(&rendered, &args.encoding);
+    let token_count = if args.tokens {
+        let bpe = get_tokenizer(&args.encoding);
+        bpe.encode_with_special_tokens(&rendered).len()
+    } else {
+        0
+    };
+
+    let paths: Vec<String> = files.iter()
+        .filter_map(|file| file.get("path").and_then(|p| p.as_str()).map(|s| s.to_string()))
+        .collect();
+
+    if args.json {
+        let json_output = json!({
+            "directory_name": label(&args.path),
+            "token_count": token_count,
+            "model_info": get_model_info(&args.encoding),
+            "files": paths,
+        });
+        println!("{}", serde_json::to_string_pretty(&json_output)?);
+        return Ok(());
     }
 
     // Copy to Clipboard
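
The token-count branch above calls tiktoken-rs directly instead of the old `count_tokens` printer, so the count can be reused in the JSON output. A minimal standalone sketch of that branch (the helper name is made up for illustration, and only the cl100k default encoding is shown):

```rust
use tiktoken_rs::cl100k_base;

// Hypothetical standalone version of the new branch: count tokens only when
// the user asked for them, otherwise report zero so the JSON field is always
// present.
fn maybe_count_tokens(rendered: &str, count_requested: bool) -> usize {
    if count_requested {
        // cl100k_base() matches what get_tokenizer returns when no --encoding
        // flag is given.
        let bpe = cl100k_base().expect("failed to load cl100k tokenizer");
        bpe.encode_with_special_tokens(rendered).len()
    } else {
        0
    }
}

fn main() {
    let rendered = "fn main() { println!(\"hello\"); }";
    println!("{} tokens", maybe_count_tokens(rendered, true));
}
```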

src/path.rs

Lines changed: 5 additions & 1 deletion

@@ -116,7 +116,11 @@ pub fn traverse_directory(
 pub fn label<P: AsRef<Path>>(p: P) -> String {
     let path = p.as_ref();
     if path.file_name().is_none() {
-        path.to_str().unwrap_or(".").to_owned()
+        let current_dir = std::env::current_dir().unwrap();
+        current_dir.file_name()
+            .and_then(|name| name.to_str())
+            .unwrap_or(".")
+            .to_owned()
     } else {
         path.file_name()
             .and_then(|name| name.to_str())
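
The `label` change means a path with no file-name component (such as `.`) now resolves to the current directory's basename instead of echoing the raw path. A small hedged sketch of the expected behaviour, assuming the crate is used as a library via its existing `label` re-export (the second printed value depends on where the program is run from):

```rust
use code2prompt::label;

fn main() {
    // A path with a final component still labels as that component.
    println!("{}", label("path/to/codebase")); // expected: "codebase"

    // "." has no file-name component; with this commit, label() falls back to
    // the basename of std::env::current_dir() instead of returning "." as-is.
    println!("{}", label("."));
}
```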

src/token.rs

Lines changed: 41 additions & 1 deletion

@@ -1,7 +1,47 @@
 //! This module encapsulates the logic for counting the tokens in the rendered text.
 
 use colored::*;
-use tiktoken_rs::{cl100k_base, p50k_base, p50k_edit, r50k_base};
+use tiktoken_rs::{cl100k_base, p50k_base, p50k_edit, r50k_base, CoreBPE};
+
+/// Returns the appropriate tokenizer based on the provided encoding.
+///
+/// # Arguments
+///
+/// * `encoding` - An optional string specifying the encoding to use for tokenization.
+///   Supported encodings: "cl100k" (default), "p50k", "p50k_edit", "r50k", "gpt2".
+///
+/// # Returns
+///
+/// * `CoreBPE` - The tokenizer corresponding to the specified encoding.
+pub fn get_tokenizer(encoding: &Option<String>) -> CoreBPE {
+    match encoding.as_deref().unwrap_or("cl100k") {
+        "cl100k" => cl100k_base().unwrap(),
+        "p50k" => p50k_base().unwrap(),
+        "p50k_edit" => p50k_edit().unwrap(),
+        "r50k" | "gpt2" => r50k_base().unwrap(),
+        _ => cl100k_base().unwrap(),
+    }
+}
+
+/// Returns the model information based on the provided encoding.
+///
+/// # Arguments
+///
+/// * `encoding` - An optional string specifying the encoding to use for retrieving model information.
+///   Supported encodings: "cl100k" (default), "p50k", "p50k_edit", "r50k", "gpt2".
+///
+/// # Returns
+///
+/// * `&'static str` - A string describing the models associated with the specified encoding.
+pub fn get_model_info(encoding: &Option<String>) -> &'static str {
+    match encoding.as_deref().unwrap_or("cl100k") {
+        "cl100k" => "ChatGPT models, text-embedding-ada-002",
+        "p50k" => "Code models, text-davinci-002, text-davinci-003",
+        "p50k_edit" => "Edit models like text-davinci-edit-001, code-davinci-edit-001",
+        "r50k" | "gpt2" => "GPT-3 models like davinci",
+        _ => "ChatGPT models, text-embedding-ada-002",
+    }
+}
 
 /// Counts the tokens in the rendered text using the specified encoding and prints the result.
 ///
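
Together, the two new helpers let a caller resolve an encoding name once and reuse it both for counting tokens and for labelling which model family that encoding targets, which is how src/main.rs builds the `--json` payload. A hedged usage sketch, assuming the crate is consumed as a library through the re-exports added in src/lib.rs:

```rust
use code2prompt::{get_model_info, get_tokenizer};

fn main() {
    // Passing None falls back to the "cl100k" default in both helpers.
    let encoding: Option<String> = Some("p50k".to_string());

    let bpe = get_tokenizer(&encoding);
    let rendered = "Source tree: src/main.rs, src/lib.rs";
    let token_count = bpe.encode_with_special_tokens(rendered).len();

    println!(
        "Token count: {} ({})",
        token_count,
        get_model_info(&encoding)
    );
}
```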
