Skip to content

Commit e63a690

Browse files
authored
Add strategy that was used to detect the file's language (#7405)
* Allow to know strategy used to detect language
* Adjust require statements
* Add tests
1 parent 652c13b commit e63a690

File tree

7 files changed

+161
-4
lines changed

7 files changed

+161
-4
lines changed

bin/github-linguist

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,14 @@ Linguist v#{Linguist::VERSION}
1313
Detect language type and determine language breakdown for a given Git repository.
1414
1515
Usage: github-linguist <path>
16-
github-linguist <path> [--rev REV] [--tree-size] [--breakdown] [--json]
17-
github-linguist [--rev REV] [--tree-size] [--breakdown] [--json]
16+
github-linguist <path> [--rev REV] [--tree-size] [--breakdown] [--strategies] [--json]
17+
github-linguist [--rev REV] [--tree-size] [--breakdown] [--strategies] [--json]
1818
HELP
1919

2020
def github_linguist(args)
2121
breakdown = false
2222
json_output = false
23+
show_strategies = false
2324
tree_size = Linguist::Repository::MAX_TREE_SIZE
2425
rev = 'HEAD'
2526
path = Dir.pwd
@@ -29,6 +30,7 @@ def github_linguist(args)
2930
opts.version = Linguist::VERSION
3031

3132
opts.on("-b", "--breakdown", "Analyze entire repository and display detailed usage statistics") { breakdown = true }
33+
opts.on("-s", "--strategies", "Show language detection strategy used for each file") { show_strategies = true }
3234
opts.on("-j", "--json", "Output results as JSON") { json_output = true }
3335
opts.on("-r", "--rev REV", String,
3436
"Analyze specific git revision",
@@ -59,6 +61,11 @@ def github_linguist(args)
5961
puts "invalid revision '#{rev}' for repo '#{path}'"
6062
exit 1
6163
end
64+
65+
# Set up instrumentation to track detection strategies if requested
66+
instrumenter = show_strategies ? Linguist::BasicInstrumenter.new : nil
67+
Linguist.instrumenter = instrumenter
68+
6269
repo = Linguist::Repository.new(rugged, target_oid, tree_size)
6370

6471
full_results = {}
@@ -72,13 +79,18 @@ def github_linguist(args)
7279
full_results.sort_by { |_, v| v[:size] }.reverse.each do |language, details|
7380
puts "%-7s %-10s %s" % ["#{details[:percentage]}%", details[:size], language]
7481
end
75-
if breakdown
82+
if breakdown || show_strategies
7683
puts
7784
file_breakdown = repo.breakdown_by_file
7885
file_breakdown.each do |lang, files|
7986
puts "#{lang}:"
8087
files.each do |file|
81-
puts file
88+
strategy_info = instrumenter&.detected_info&.[](file)
89+
if show_strategies && strategy_info
90+
puts " #{file} [#{strategy_info[:strategy]}]"
91+
else
92+
puts " #{file}"
93+
end
8294
end
8395
puts
8496
end
@@ -96,6 +108,9 @@ def github_linguist(args)
96108
end
97109
end
98110
elsif File.file?(path)
111+
# Set up instrumentation to track detection strategies if requested
112+
instrumenter = show_strategies ? Linguist::BasicInstrumenter.new : nil
113+
Linguist.instrumenter = instrumenter
99114

100115
begin
101116
# Check if this file is inside a git repository so we have things like
@@ -134,6 +149,10 @@ def github_linguist(args)
134149
puts " type: #{type}"
135150
puts " mime type: #{blob.mime_type}"
136151
puts " language: #{blob.language}"
152+
if show_strategies && blob.language
153+
strategy_info = instrumenter.detected_info[blob.name]
154+
puts " strategy: #{strategy_info[:strategy]}" if strategy_info
155+
end
137156

138157
if blob.large?
139158
puts " blob is too large to be shown"

lib/linguist.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
require 'linguist/version'
1010
require 'linguist/strategy/manpage'
1111
require 'linguist/strategy/xml'
12+
require 'linguist/instrumenter'
1213

1314
class << Linguist
1415
# Public: Detects the Language of the blob.

lib/linguist/instrumenter.rb

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
module Linguist
  # Public: Minimal instrumenter that remembers, per blob, which detection
  # strategy produced the final language.
  #
  # Assign an instance to `Linguist.instrumenter` before running detection,
  # then read `detected_info` afterwards.
  class BasicInstrumenter
    # Public: Hash mapping a blob's name to
    # `{ strategy: String or nil, language: String or nil }`.
    attr_reader :detected_info

    def initialize
      @detected_info = {}
    end

    # Public: Receive an instrumentation event.
    #
    # name    - String event name. Only "linguist.detected" events that carry
    #           a :blob are recorded; every other event is ignored.
    # payload - Hash of event data. The keys read here are :blob (responds to
    #           #name), :strategy and :language (responds to #name, or nil).
    #
    # Yields to the given block, if any, so instrumented code still runs.
    #
    # Returns the block's result, or nil when no block is given.
    def instrument(name, payload = {})
      record_detection(payload) if name == "linguist.detected" && payload[:blob]
      yield if block_given?
    end

    private

    # Internal: Store the strategy and language reported for the payload's blob.
    # A later event for the same blob name overwrites the earlier entry.
    def record_detection(payload)
      @detected_info[payload[:blob].name] = {
        strategy: strategy_label(payload[:strategy]),
        language: payload[:language]&.name
      }
    end

    # Internal: Short, un-namespaced name of a strategy
    # (e.g. "Linguist::Strategy::Extension" => "Extension").
    #
    # Returns nil instead of raising when the strategy is nil, does not
    # respond to #name, or is anonymous (its #name is nil).
    def strategy_label(strategy)
      return nil unless strategy.respond_to?(:name)

      strategy.name&.split("::")&.last
    end
  end
end

test/fixtures/Ruby/foo.rb

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# This file is used to test the Linguist language detection capabilities.
2+
# It should be detected as Ruby based on its extension and content.
3+
# The file is intentionally simple to ensure it does not contain complex logic.
4+
# You can add more Ruby code here if needed for further testing.
5+
# The purpose of this file is to serve as a fixture for testing the Linguist library.
6+
# It should not be executed in a production environment.
7+
# Ensure that this file is saved with the .rb extension to be recognized as Ruby code.
8+
9+
puts "This is a sample Ruby file for testing purposes."
10+
11+

test/fixtures/Shell/sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/sh
2+
# This is a comment
3+
echo "Hello, World!" # Print a message to the console

test/fixtures/VBA/sample.bas

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
Attribute VB_Name = "Utilities"
2+
Option Explicit
3+
4+
Sub EditExcelRange()
5+
Dim ws As Worksheet
6+
Set ws = ThisWorkbook.Sheets("Sheet1")
7+
8+
' Edit a range in the worksheet
9+
ws.Range("A1").Value = "Hello, World!"
10+
ws.Range("A2").Value = 42
11+
ws.Range("A3").Value = Date
12+
13+
' Format the range
14+
With ws.Range("A1:A3")
15+
.Font.Bold = True
16+
.Interior.Color = RGB(255, 255, 0) ' Yellow background
17+
End With
18+
19+
MsgBox "Range edited successfully!"
20+
End Sub

test/test_basic_instrumenter.rb

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
require_relative "./helper"
2+
3+
# Exercises Linguist::BasicInstrumenter by running real detection against
# fixture blobs and checking which strategy was recorded for each one.
class TestBasicInstrumenter < Minitest::Test
  include Linguist

  # Install a fresh instrumenter for every test, remembering whatever was
  # installed before so the global can be put back afterwards.
  def setup
    @previous_instrumenter = Linguist.instrumenter
    @instrumenter = Linguist::BasicInstrumenter.new
    Linguist.instrumenter = @instrumenter
  end

  # Restore the prior instrumenter rather than forcing nil, so this test
  # class does not clobber global state other code may have configured.
  def teardown
    Linguist.instrumenter = @previous_instrumenter
  end

  def test_tracks_extension_strategy
    # Ruby file detected by extension
    blob = fixture_blob("Ruby/foo.rb")
    Linguist.detect(blob)

    assert @instrumenter.detected_info.key?(blob.name)
    assert_equal "Extension", @instrumenter.detected_info[blob.name][:strategy]
    assert_equal "Ruby", @instrumenter.detected_info[blob.name][:language]
  end

  def test_tracks_modeline_strategy
    # File with vim modeline
    blob = fixture_blob("Data/Modelines/ruby")
    Linguist.detect(blob)

    assert @instrumenter.detected_info.key?(blob.name)
    assert_equal "Modeline", @instrumenter.detected_info[blob.name][:strategy]
    assert_equal "Ruby", @instrumenter.detected_info[blob.name][:language]
  end

  def test_tracks_shebang_strategy
    # File with shebang
    blob = fixture_blob("Shell/sh")
    Linguist.detect(blob)

    assert @instrumenter.detected_info.key?(blob.name)
    assert_equal "Shebang", @instrumenter.detected_info[blob.name][:strategy]
    assert_equal "Shell", @instrumenter.detected_info[blob.name][:language]
  end

  def test_tracks_multiple_files
    # Track multiple files in sequence
    ruby_blob = fixture_blob("Ruby/foo.rb")
    shell_blob = fixture_blob("Shell/sh")

    Linguist.detect(ruby_blob)
    Linguist.detect(shell_blob)

    assert_equal 2, @instrumenter.detected_info.size
    assert @instrumenter.detected_info.key?(ruby_blob.name)
    assert @instrumenter.detected_info.key?(shell_blob.name)
  end

  def test_no_tracking_for_binary_files
    binary_blob = fixture_blob("Binary/octocat.ai")
    Linguist.detect(binary_blob)

    # Should not record info for binary files
    assert_equal 0, @instrumenter.detected_info.size
  end

  def test_records_correct_strategy_for_heuristics
    # .bas file that should be detected via heuristics
    blob = fixture_blob("VBA/sample.bas")
    Linguist.detect(blob)

    assert @instrumenter.detected_info.key?(blob.name)
    assert_equal "Heuristics", @instrumenter.detected_info[blob.name][:strategy]
  end

  def test_tracks_filename_strategy
    # Dockerfile detected by filename
    blob = fixture_blob("Dockerfile/Dockerfile")
    Linguist.detect(blob)

    assert @instrumenter.detected_info.key?(blob.name)
    assert_equal "Filename", @instrumenter.detected_info[blob.name][:strategy]
    assert_equal "Dockerfile", @instrumenter.detected_info[blob.name][:language]
  end
end

0 commit comments

Comments
 (0)