
Commit a33aba5

feat: json output on --json flag
1 parent 14116aa commit a33aba5

File tree

README.md
src/lib.rs
src/main.rs
src/path.rs
src/token.rs

5 files changed: +90 -7 lines changed

README.md

Lines changed: 17 additions & 0 deletions

@@ -125,6 +125,23 @@ Save the generated prompt to an output file:
 code2prompt path/to/codebase --output=output.txt
 ```
 
+Print output as JSON:
+
+```sh
+code2prompt path/to/codebase --json
+```
+
+The JSON output will have the following structure:
+
+```json
+{
+  "directory_name": "codebase",
+  "token_count": 1234,
+  "model_info": "ChatGPT models, text-embedding-ada-002",
+  "files": [...]
+}
+```
+
 Generate a Git commit message (for staged files):
 
 ```sh
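
For readers consuming the new `--json` mode from another program, here is a rough, hedged sketch of what a matching serde definition could look like; the struct name and field types are assumptions inferred only from the README example above (and from the `Vec<String>` of paths built in src/main.rs below), assuming the `serde` (with `derive`) and `serde_json` crates:

```rust
use serde::Deserialize;

// Hypothetical mirror of the documented --json output; field names follow the
// README example above, field types are assumed.
#[derive(Debug, Deserialize)]
struct Code2PromptOutput {
    directory_name: String,
    token_count: usize,
    model_info: String,
    files: Vec<String>,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // In practice this string would come from running `code2prompt <path> --json`.
    let raw = r#"{
        "directory_name": "codebase",
        "token_count": 1234,
        "model_info": "ChatGPT models, text-embedding-ada-002",
        "files": ["src/main.rs", "src/lib.rs"]
    }"#;

    let parsed: Code2PromptOutput = serde_json::from_str(raw)?;
    println!("{} files, {} tokens", parsed.files.len(), parsed.token_count);
    Ok(())
}
```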

src/lib.rs

Lines changed: 1 addition & 1 deletion

@@ -10,4 +10,4 @@ pub use path::{label, traverse_directory};
 pub use template::{
     copy_to_clipboard, handle_undefined_variables, handlebars_setup, render_template, write_to_file,
 };
-pub use token::count_tokens;
+pub use token::{count_tokens, get_model_info, get_tokenizer};

src/main.rs

Lines changed: 26 additions & 4 deletions

@@ -6,7 +6,7 @@
 use anyhow::{Context, Result};
 use clap::Parser;
 use code2prompt::{
-    copy_to_clipboard, count_tokens, get_git_diff, handle_undefined_variables, handlebars_setup,
+    copy_to_clipboard, get_model_info, get_tokenizer, get_git_diff, handle_undefined_variables, handlebars_setup,
     label, render_template, traverse_directory, write_to_file,
 };
 use colored::*;
@@ -76,6 +76,10 @@ struct Cli {
     /// Optional Path to a custom Handlebars template
     #[clap(short, long)]
     template: Option<PathBuf>,
+
+    /// Print output as JSON
+    #[clap(long)]
+    json: bool,
 }
 
 fn main() -> Result<()> {
@@ -128,7 +132,7 @@ fn main() -> Result<()> {
     };
 
     spinner.finish_with_message("Done!".green().to_string());
-
+
     // Prepare JSON Data
     let mut data = json!({
         "absolute_code_path": label(&args.path),
@@ -149,8 +153,26 @@ fn main() -> Result<()> {
     let rendered = render_template(&handlebars, template_name, &data)?;
 
     // Display Token Count
-    if args.tokens {
-        count_tokens(&rendered, &args.encoding);
+    let token_count = if args.tokens {
+        let bpe = get_tokenizer(&args.encoding);
+        bpe.encode_with_special_tokens(&rendered).len()
+    } else {
+        0
+    };
+
+    let paths: Vec<String> = files.iter()
+        .filter_map(|file| file.get("path").and_then(|p| p.as_str()).map(|s| s.to_string()))
+        .collect();
+
+    if args.json {
+        let json_output = json!({
+            "directory_name": label(&args.path),
+            "token_count": token_count,
+            "model_info": get_model_info(&args.encoding),
+            "files": paths,
+        });
+        println!("{}", serde_json::to_string_pretty(&json_output)?);
+        return Ok(());
     }
 
     // Copy to Clipboard
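
The token-count branch above calls tiktoken-rs directly instead of the old `count_tokens` printer, so the count can be reused in the JSON output. A minimal standalone sketch of that branch (the helper name is made up for illustration, and only the cl100k default encoding is shown):

```rust
use tiktoken_rs::cl100k_base;

// Hypothetical standalone version of the new branch: count tokens only when
// the user asked for them, otherwise report zero so the JSON field is always
// present.
fn maybe_count_tokens(rendered: &str, count_requested: bool) -> usize {
    if count_requested {
        // cl100k_base() matches what get_tokenizer returns when no --encoding
        // flag is given.
        let bpe = cl100k_base().expect("failed to load cl100k tokenizer");
        bpe.encode_with_special_tokens(rendered).len()
    } else {
        0
    }
}

fn main() {
    let rendered = "fn main() { println!(\"hello\"); }";
    println!("{} tokens", maybe_count_tokens(rendered, true));
}
```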

src/path.rs

Lines changed: 5 additions & 1 deletion

@@ -116,7 +116,11 @@ pub fn traverse_directory(
 pub fn label<P: AsRef<Path>>(p: P) -> String {
     let path = p.as_ref();
     if path.file_name().is_none() {
-        path.to_str().unwrap_or(".").to_owned()
+        let current_dir = std::env::current_dir().unwrap();
+        current_dir.file_name()
+            .and_then(|name| name.to_str())
+            .unwrap_or(".")
+            .to_owned()
     } else {
         path.file_name()
             .and_then(|name| name.to_str())
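
The `label` change means a path with no file-name component (such as `.`) now resolves to the current directory's basename instead of echoing the raw path. A small hedged sketch of the expected behaviour, assuming the crate is used as a library via its existing `label` re-export (the second printed value depends on where the program is run from):

```rust
use code2prompt::label;

fn main() {
    // A path with a final component still labels as that component.
    println!("{}", label("path/to/codebase")); // expected: "codebase"

    // "." has no file-name component; with this commit, label() falls back to
    // the basename of std::env::current_dir() instead of returning "." as-is.
    println!("{}", label("."));
}
```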

src/token.rs

Lines changed: 41 additions & 1 deletion

@@ -1,7 +1,47 @@
 //! This module encapsulates the logic for counting the tokens in the rendered text.
 
 use colored::*;
-use tiktoken_rs::{cl100k_base, p50k_base, p50k_edit, r50k_base};
+use tiktoken_rs::{cl100k_base, p50k_base, p50k_edit, r50k_base, CoreBPE};
+
+/// Returns the appropriate tokenizer based on the provided encoding.
+///
+/// # Arguments
+///
+/// * `encoding` - An optional string specifying the encoding to use for tokenization.
+///   Supported encodings: "cl100k" (default), "p50k", "p50k_edit", "r50k", "gpt2".
+///
+/// # Returns
+///
+/// * `CoreBPE` - The tokenizer corresponding to the specified encoding.
+pub fn get_tokenizer(encoding: &Option<String>) -> CoreBPE {
+    match encoding.as_deref().unwrap_or("cl100k") {
+        "cl100k" => cl100k_base().unwrap(),
+        "p50k" => p50k_base().unwrap(),
+        "p50k_edit" => p50k_edit().unwrap(),
+        "r50k" | "gpt2" => r50k_base().unwrap(),
+        _ => cl100k_base().unwrap(),
+    }
+}
+
+/// Returns the model information based on the provided encoding.
+///
+/// # Arguments
+///
+/// * `encoding` - An optional string specifying the encoding to use for retrieving model information.
+///   Supported encodings: "cl100k" (default), "p50k", "p50k_edit", "r50k", "gpt2".
+///
+/// # Returns
+///
+/// * `&'static str` - A string describing the models associated with the specified encoding.
+pub fn get_model_info(encoding: &Option<String>) -> &'static str {
+    match encoding.as_deref().unwrap_or("cl100k") {
+        "cl100k" => "ChatGPT models, text-embedding-ada-002",
+        "p50k" => "Code models, text-davinci-002, text-davinci-003",
+        "p50k_edit" => "Edit models like text-davinci-edit-001, code-davinci-edit-001",
+        "r50k" | "gpt2" => "GPT-3 models like davinci",
+        _ => "ChatGPT models, text-embedding-ada-002",
+    }
+}
 
 /// Counts the tokens in the rendered text using the specified encoding and prints the result.
 ///
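
Together, the two new helpers let a caller resolve an encoding name once and reuse it both for counting tokens and for labelling which model family that encoding targets, which is how src/main.rs builds the `--json` payload. A hedged usage sketch, assuming the crate is consumed as a library through the re-exports added in src/lib.rs:

```rust
use code2prompt::{get_model_info, get_tokenizer};

fn main() {
    // Passing None falls back to the "cl100k" default in both helpers.
    let encoding: Option<String> = Some("p50k".to_string());

    let bpe = get_tokenizer(&encoding);
    let rendered = "Source tree: src/main.rs, src/lib.rs";
    let token_count = bpe.encode_with_special_tokens(rendered).len();

    println!(
        "Token count: {} ({})",
        token_count,
        get_model_info(&encoding)
    );
}
```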
