Skip to content

Add SLoC (Source Lines of Code) metric to versions #11453

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
434 changes: 432 additions & 2 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions crates/crates_io_database/src/models/version.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ pub struct Version {
pub homepage: Option<String>,
pub documentation: Option<String>,
pub repository: Option<String>,
pub linecounts: Option<serde_json::Value>,
}

impl Version {
Expand Down Expand Up @@ -103,6 +104,7 @@ pub struct NewVersion<'a> {
repository: Option<&'a str>,
categories: Option<&'a [&'a str]>,
keywords: Option<&'a [&'a str]>,
linecounts: Option<serde_json::Value>,
}

impl NewVersion<'_> {
Expand Down
2 changes: 2 additions & 0 deletions crates/crates_io_database/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1077,6 +1077,8 @@ diesel::table! {
keywords -> Array<Nullable<Text>>,
/// JSONB representation of the version number for sorting purposes.
semver_ord -> Nullable<Jsonb>,
/// Source Lines of Code statistics for this version, stored as JSON with language breakdown and totals.
linecounts -> Nullable<Jsonb>,
}
}

Expand Down
2 changes: 2 additions & 0 deletions crates/crates_io_database_dump/src/dump-db.toml
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,8 @@ documentation = "public"
repository = "public"
categories = "public"
keywords = "public"
# The following column is private for now, until we can guarantee a stable data schema.
linecounts = "private"

[versions_published_by.columns]
version_id = "private"
Expand Down
17 changes: 17 additions & 0 deletions crates/crates_io_linecount/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Manifest for the `crates_io_linecount` crate.
[package]
name = "crates_io_linecount"
version = "0.0.0"
description = "Lines of code counting for crates.io using tokei"
license = "MIT OR Apache-2.0"
edition = "2024"

# Lint settings are inherited from the workspace.
[lints]
workspace = true

# Every dependency below is pinned to an exact version (`=`).
[dependencies]
serde = { version = "=1.0.219", features = ["derive"] }
# Note: this pin is a pre-release (alpha) version of tokei.
tokei = "=13.0.0-alpha.8"

# Test-only dependencies: `claims` provides the `assert_some!`/`assert_none!`
# macros and `insta` the JSON snapshot assertions used by the crate's tests.
[dev-dependencies]
claims = "=0.8.0"
insta = { version = "=1.43.1", features = ["json"] }
232 changes: 232 additions & 0 deletions crates/crates_io_linecount/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::Path;
use std::sync::LazyLock;
use tokei::Config;

// Re-export LanguageType for use by other crates
pub use tokei::LanguageType;

/// Tokei configuration used for analysis (cached)
///
/// The `LazyLock` builds the `Config` once, on first access; every call to
/// `LinecountStats::add_file` then borrows this same instance.
static TOKEI_CONFIG: LazyLock<Config> = LazyLock::new(|| Config {
// NOTE(review): presumably tells tokei not to honor ignore files — confirm against tokei's `Config` docs.
no_ignore: Some(true),
// NOTE(review): by its name, makes doc strings count as comments rather than code — confirm against tokei's `Config` docs.
treat_doc_strings_as_comments: Some(true),
// All remaining options keep tokei's defaults.
..Default::default()
});

/// Statistics for a single programming language
///
/// One instance is kept per language in `LinecountStats::languages`.
/// `LinecountStats::add_file` adds the counts of each parsed file to it, so the
/// fields are running sums over all files of that language. `Default` yields
/// all-zero counters.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
pub struct LanguageStats {
/// Number of lines of code (excluding comments and blank lines)
pub code_lines: usize,
/// Number of comment lines
pub comment_lines: usize,
/// Number of files of this language (incremented once per `add_file` call)
pub files: usize,
}

/// Complete line count statistics for a crate
///
/// Populated incrementally through `add_file()`, which updates both the
/// per-language entry and the totals with the same counts. `Default` (and
/// `new()`) yield an empty language map and zeroed totals.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
pub struct LinecountStats {
/// Per-language breakdown of line counts, keyed by tokei's `LanguageType`
pub languages: HashMap<LanguageType, LanguageStats>,
/// Total lines of code across all languages
pub total_code_lines: usize,
/// Total comment lines across all languages
pub total_comment_lines: usize,
}

impl LinecountStats {
/// Create a new empty statistics collection
pub fn new() -> Self {
Self::default()
}

/// Add a single file to the statistics
///
/// The caller can use `should_count_path()` to check if a file should be processed
/// before decompressing to avoid unnecessary work.
pub fn add_file(&mut self, language_type: LanguageType, content: &[u8]) {
let file_stats = language_type.parse_from_slice(content, &TOKEI_CONFIG);

// Update language-specific stats
let entry = self.languages.entry(language_type).or_default();
entry.code_lines += file_stats.code;
entry.comment_lines += file_stats.comments;
entry.files += 1;

// Update totals
self.total_code_lines += file_stats.code;
self.total_comment_lines += file_stats.comments;
}
}

/// Check if a path should be counted and return its language type
///
/// A file is skipped (`None` is returned) when any of the following holds:
///
/// - one of its parent directories is named `tests`, `test`, `testing`,
///   `examples`, `benches` or `benchmark` (compared ASCII case-insensitively),
/// - its file name starts with a `.` (hidden file),
/// - it has no extension, or the extension is not valid UTF-8,
/// - `LanguageType::from_file_extension` does not recognize the extension,
/// - `is_countable_language()` rejects the detected language.
///
/// Directory names are compared as whole path components, so a directory such
/// as `latest/` or `contest/` is not mistaken for `test/`. Directories whose
/// name merely contains one of the excluded names (e.g. `unit_tests/`) are
/// therefore counted.
///
/// Returns `Some(LanguageType)` if the file should be analyzed, `None` otherwise.
pub fn should_count_path(path: &Path) -> Option<LanguageType> {
    /// Directory names whose contents are excluded from the statistics
    const SKIPPED_DIRS: [&str; 6] = [
        "tests",
        "test",
        "testing",
        "examples",
        "benches",
        "benchmark",
    ];

    // Skip test and example directories. Only the parent directories are
    // inspected here; the file name itself is handled by the checks below.
    let in_skipped_dir = path.parent().is_some_and(|dir| {
        dir.components().any(|component| {
            let name = component.as_os_str().to_string_lossy();
            SKIPPED_DIRS
                .iter()
                .any(|&skipped| name.eq_ignore_ascii_case(skipped))
        })
    });
    if in_skipped_dir {
        return None;
    }

    // Skip hidden files
    let is_hidden = path
        .file_name()
        .is_some_and(|name| name.to_string_lossy().starts_with('.'));
    if is_hidden {
        return None;
    }

    // Get language type from file extension
    let extension = path.extension().and_then(|ext| ext.to_str())?;
    let language_type = LanguageType::from_file_extension(extension)?;

    // Only count if it's a programming language
    is_countable_language(language_type).then_some(language_type)
}

/// Decide whether files of the given language contribute to the line counts
///
/// Returns `false` for the configuration, documentation, build-system,
/// scripting and other non-programming languages listed below, and `true`
/// for every other language.
fn is_countable_language(lang: LanguageType) -> bool {
    match lang {
        // Configuration and data files
        LanguageType::Json
        | LanguageType::Yaml
        | LanguageType::Toml
        | LanguageType::Xml
        | LanguageType::Ini
        // Documentation
        | LanguageType::Markdown
        | LanguageType::Text
        | LanguageType::ReStructuredText
        | LanguageType::AsciiDoc
        | LanguageType::Org
        // Build system files
        | LanguageType::Makefile
        | LanguageType::CMake
        | LanguageType::Dockerfile
        | LanguageType::Autoconf
        | LanguageType::MsBuild
        | LanguageType::Meson
        | LanguageType::Scons
        | LanguageType::Bazel
        | LanguageType::Nix
        // Shell scripts (debatable, but often just build/deploy automation)
        | LanguageType::Batch
        | LanguageType::PowerShell
        // Other non-programming files
        | LanguageType::Svg
        | LanguageType::Hex
        | LanguageType::Protobuf
        | LanguageType::Thrift => false,

        // Everything else is treated as a programming language
        _ => true,
    }
}

// Unit tests for the line counting API. Statistics are checked with inline
// `insta` JSON snapshots; path filtering is checked with the `claims` macros.
#[cfg(test)]
mod tests {
use super::*;
use claims::{assert_none, assert_some};

// A freshly created collection has no languages and zeroed totals.
#[test]
fn test_empty() {
let stats = LinecountStats::new();
insta::assert_json_snapshot!(stats, @r#"
{
"languages": {},
"total_code_lines": 0,
"total_comment_lines": 0
}
"#);
}

// Adding a single Rust file updates both the per-language entry and the totals.
#[test]
fn test_add_file() {
let mut stats = LinecountStats::new();

// Add a Rust file
// (one comment line followed by three lines of code)
let rust_code = b"// This is a comment\nfn main() {\n println!(\"Hello\");\n}";
stats.add_file(LanguageType::Rust, rust_code);

insta::assert_json_snapshot!(stats, @r#"
{
"languages": {
"Rust": {
"code_lines": 3,
"comment_lines": 1,
"files": 1
}
},
"total_code_lines": 3,
"total_comment_lines": 1
}
"#);
}

// End-to-end flow: filter each path with `should_count_path()` and only feed
// the accepted files to `add_file()`. Only `src/lib.rs` ends up counted.
#[test]
fn test_workflow() {
let mut stats = LinecountStats::new();

let files = [
("src/lib.rs", "pub fn hello() {}"),
("tests/test.rs", "fn test() {}"), // Should be skipped
("README.md", "# Hello"), // Should be skipped
];

for (path, content) in files {
let path = Path::new(path);
if let Some(language_type) = should_count_path(path) {
stats.add_file(language_type, content.as_bytes());
}
}

insta::assert_json_snapshot!(stats, @r#"
{
"languages": {
"Rust": {
"code_lines": 1,
"comment_lines": 0,
"files": 1
}
},
"total_code_lines": 1,
"total_comment_lines": 0
}
"#);
}

// Files under test, example and benchmark directories are rejected, while a
// regular source file is accepted.
#[test]
fn test_should_count_path() {
assert_none!(should_count_path(Path::new("src/tests/mod.rs")));
assert_none!(should_count_path(Path::new("tests/integration.rs")));
assert_none!(should_count_path(Path::new("examples/basic.rs")));
assert_none!(should_count_path(Path::new("benches/bench.rs")));
assert_some!(should_count_path(Path::new("src/lib.rs")));
}

// Spot-checks the language allow/deny decision made by `is_countable_language()`.
#[test]
fn test_language_filtering() {
// Should count programming languages
assert!(is_countable_language(LanguageType::Rust));
assert!(is_countable_language(LanguageType::JavaScript));
assert!(is_countable_language(LanguageType::Html));
assert!(is_countable_language(LanguageType::Css));

// Should skip config/data files
assert!(!is_countable_language(LanguageType::Json));
assert!(!is_countable_language(LanguageType::Yaml));
assert!(!is_countable_language(LanguageType::Toml));
assert!(!is_countable_language(LanguageType::Markdown));
}
}
1 change: 1 addition & 0 deletions crates/crates_io_tarball/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ builder = ["dep:flate2", "dep:tar"]
[dependencies]
astral-tokio-tar = "=0.5.2"
cargo-manifest = "=0.19.1"
crates_io_linecount = { path = "../crates_io_linecount" }
flate2 = { version = "=1.1.2", optional = true }
serde = { version = "=1.0.219", features = ["derive"] }
serde_json = "=1.0.140"
Expand Down
19 changes: 18 additions & 1 deletion crates/crates_io_tarball/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ const DEFAULT_BUF_SIZE: usize = 128 * 1024;
pub struct TarballInfo {
pub manifest: Manifest,
pub vcs_info: Option<CargoVcsInfo>,
pub linecount_stats: crates_io_linecount::LinecountStats,
}

#[derive(Debug, thiserror::Error)]
Expand Down Expand Up @@ -74,6 +75,7 @@ pub async fn process_tarball<R: tokio::io::AsyncRead + Unpin>(
let mut vcs_info = None;
let mut paths = Vec::new();
let mut manifests = BTreeMap::new();
let mut linecount_stats = crates_io_linecount::LinecountStats::new();
let mut entries = archive.entries()?;

while let Some(entry) = entries.next().await {
Expand Down Expand Up @@ -103,6 +105,12 @@ pub async fn process_tarball<R: tokio::io::AsyncRead + Unpin>(

paths.push(in_pkg_path.to_path_buf());

// Check if this file should be counted for line statistics
let is_file = entry_type.is_file();
let language_type_for_counting = is_file
.then(|| crates_io_linecount::should_count_path(in_pkg_path))
.flatten();

// Let's go hunting for the VCS info and crate manifest. The only valid place for these is
// in the package root in the tarball.
let in_pkg_path_str = in_pkg_path.to_string_lossy();
Expand All @@ -121,6 +129,11 @@ pub async fn process_tarball<R: tokio::io::AsyncRead + Unpin>(
validate_manifest(&manifest)?;

manifests.insert(owned_entry_path, manifest);
} else if let Some(language_type) = language_type_for_counting {
// If this is a file that we want to count, read it and update the line count stats.
let mut contents = Vec::new();
entry.read_to_end(&mut contents).await?;
linecount_stats.add_file(language_type, &contents);
}
}

Expand All @@ -146,7 +159,11 @@ pub async fn process_tarball<R: tokio::io::AsyncRead + Unpin>(

manifest.complete_from_abstract_filesystem(&PathsFileSystem(paths))?;

Ok(TarballInfo { manifest, vcs_info })
Ok(TarballInfo {
manifest,
vcs_info,
linecount_stats,
})
}

struct PathsFileSystem(Vec<PathBuf>);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,4 +76,15 @@ TarballInfo {
badges: None,
},
vcs_info: None,
linecount_stats: LinecountStats {
languages: {
Rust: LanguageStats {
code_lines: 1,
comment_lines: 0,
files: 1,
},
},
total_code_lines: 1,
total_comment_lines: 0,
},
}
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,15 @@ TarballInfo {
badges: None,
},
vcs_info: None,
linecount_stats: LinecountStats {
languages: {
Rust: LanguageStats {
code_lines: 1,
comment_lines: 0,
files: 1,
},
},
total_code_lines: 1,
total_comment_lines: 0,
},
}
Original file line number Diff line number Diff line change
Expand Up @@ -140,4 +140,15 @@ TarballInfo {
badges: None,
},
vcs_info: None,
linecount_stats: LinecountStats {
languages: {
Rust: LanguageStats {
code_lines: 3,
comment_lines: 0,
files: 3,
},
},
total_code_lines: 3,
total_comment_lines: 0,
},
}
Loading