Skip to content

GCI98 PandasRequireUsecolsArgument #Python #DLG #Build #71

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Add rule GCI98 Require Usecols Argument in Pandas Read Functions

### Changed

- compatibility updates for SonarQube 25.5.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ public class PythonRuleRepository implements RulesDefinition, PythonCustomRuleRe
AvoidFullSQLRequest.class,
AvoidListComprehensionInIterations.class,
DetectUnoptimizedImageFormat.class,
AvoidMultipleIfElseStatementCheck.class
AvoidMultipleIfElseStatementCheck.class,
RequireUsecolsArgument.class
);

public static final String LANGUAGE = "py";
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* creedengo - Python language - Provides rules to reduce the environmental footprint of your Python programs
* Copyright © 2024 Green Code Initiative (https://green-code-initiative.org)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.greencodeinitiative.creedengo.python.checks;

import java.util.Arrays;
import java.util.List;
import org.sonar.check.Rule;
import org.sonar.plugins.python.api.PythonSubscriptionCheck;
import org.sonar.plugins.python.api.SubscriptionContext;
import org.sonar.plugins.python.api.tree.Argument;
import org.sonar.plugins.python.api.tree.CallExpression;
import org.sonar.plugins.python.api.tree.Tree;
import org.sonar.plugins.python.api.tree.Expression;
import org.sonar.plugins.python.api.tree.QualifiedExpression;
import org.sonar.plugins.python.api.tree.RegularArgument;
import static org.sonar.plugins.python.api.tree.Tree.Kind.*;

/**
 * GCI98 - reports calls to DataFrame reader methods that do not restrict the columns to load.
 *
 * <p>A call is inspected when its callee is a qualified expression (for example
 * {@code pd.read_csv(...)}) whose method name is one of {@link #READ_METHODS}. The qualifier
 * itself is not resolved, so any object exposing a method with one of these names is matched.
 *
 * <p>The call is considered compliant when it passes a {@code usecols} or {@code columns}
 * keyword argument, or when it forwards keyword arguments through {@code **mapping}, because
 * the content of that mapping cannot be determined from the call site.
 */
@Rule(key = "GCI98")
public class RequireUsecolsArgument extends PythonSubscriptionCheck {

    /** Message attached to every issue raised by this rule. */
    public static final String DESCRIPTION = "Specify 'usecols' or 'columns' when reading a DataFrame to load only necessary columns";

    /**
     * Method names that trigger the check.
     *
     * NOTE(review): the pandas documentation for read_json lists neither 'usecols' nor
     * 'columns' - confirm whether read_json should stay in this list.
     */
    private static final List<String> READ_METHODS = Arrays.asList(
        "read_csv", "read_parquet", "read_excel", "read_feather", "read_json"
    );

    /** Keyword argument names accepted as a column restriction. */
    private static final List<String> COLUMN_KEYWORDS = Arrays.asList("usecols", "columns");

    /**
     * Subscribes this check to every call expression of the analyzed file.
     *
     * @param context registration context provided by the analyzer
     */
    @Override
    public void initialize(Context context) {
        context.registerSyntaxNodeConsumer(Tree.Kind.CALL_EXPR, this::visitCallExpression);
    }

    /**
     * Raises an issue on the first token of a reader call that has no column restriction.
     *
     * @param ctx subscription context whose syntax node is a call expression
     */
    public void visitCallExpression(SubscriptionContext ctx) {
        CallExpression callExpression = (CallExpression) ctx.syntaxNode();
        Expression callee = callExpression.callee();

        // Only "qualifier.method(...)" calls are inspected; bare names are ignored.
        if (!callee.is(Tree.Kind.QUALIFIED_EXPR)) {
            return;
        }

        String methodName = ((QualifiedExpression) callee).name().name();
        if (READ_METHODS.contains(methodName) && !hasColumnsSpecified(callExpression)) {
            ctx.addIssue(callExpression.firstToken(), DESCRIPTION);
        }
    }

    /**
     * Tells whether the call restricts, or may restrict, the columns being loaded.
     *
     * @param callExpression the reader call to inspect
     * @return {@code true} if a {@code usecols}/{@code columns} keyword argument is present or
     *         keyword arguments are forwarded with {@code **}; {@code false} otherwise
     */
    private boolean hasColumnsSpecified(CallExpression callExpression) {
        for (Argument arg : callExpression.arguments()) {
            if (arg.is(Tree.Kind.REGULAR_ARGUMENT)) {
                RegularArgument regularArg = (RegularArgument) arg;
                if (regularArg.keywordArgument() != null
                    && COLUMN_KEYWORDS.contains(regularArg.keywordArgument().name())) {
                    return true;
                }
            } else if (isKeywordUnpacking(arg)) {
                // "**options" may carry usecols/columns; stay silent rather than
                // report a call that could already be compliant.
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether the argument is a {@code **mapping} unpacking.
     *
     * @param arg a call argument
     * @return {@code true} when the argument is an unpacking expression starting with {@code **}
     */
    private static boolean isKeywordUnpacking(Argument arg) {
        return arg.is(Tree.Kind.UNPACKING_EXPR)
            && arg.firstToken() != null
            && "**".equals(arg.firstToken().value());
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/*
* creedengo - Python language - Provides rules to reduce the environmental footprint of your Python programs
* Copyright © 2024 Green Code Initiative (https://green-code-initiative.org)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.greencodeinitiative.creedengo.python.checks;

import org.junit.Test;
import org.sonar.python.checks.utils.PythonCheckVerifier;

/**
 * Verifies {@link RequireUsecolsArgument} against its annotated Python fixture.
 */
public class RequireUsecolsArgumentTest {

    /** Path of the Python fixture holding the expected issues. */
    private static final String FIXTURE = "src/test/resources/checks/requireUsecolsArgument.py";

    /** Runs the check on the fixture and compares raised issues with the annotations. */
    @Test
    public void test() {
        final RequireUsecolsArgument check = new RequireUsecolsArgument();
        PythonCheckVerifier.verify(FIXTURE, check);
    }
}
23 changes: 23 additions & 0 deletions src/test/resources/checks/requireUsecolsArgument.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pandas as pd

# Reader calls without any column restriction: each one is expected to raise an issue.
df1 = pd.read_csv('data.csv') # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame to load only necessary columns}}
df2 = pd.read_parquet('data.parquet') # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame to load only necessary columns}}
df3 = pd.read_excel('data.xlsx') # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame to load only necessary columns}}
df4 = pd.read_json('data.json') # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame to load only necessary columns}}
df5 = pd.read_feather('data.feather') # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame to load only necessary columns}}

# Reader calls that pass 'usecols' or 'columns': no issue expected.
df7 = pd.read_csv('data.csv', usecols=['col1', 'col2'])
df8 = pd.read_parquet('data.parquet', columns=['col1', 'col2'])
df9 = pd.read_excel('data.xlsx', usecols=[0, 1, 2])
df10 = pd.read_json('data.json', columns=['col1', 'col2'])
df11 = pd.read_feather('data.feather', columns=['col1', 'col2'])

# Same behaviour is expected when the module is imported under another alias.
import pandas as pandas_alias
df14 = pandas_alias.read_csv('data.csv') # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame to load only necessary columns}}
df15 = pandas_alias.read_csv('data.csv', usecols=['col1'])

# Other keyword arguments do not count as a column restriction.
df16 = pd.read_csv('data.csv', sep=',', header=0) # Noncompliant {{Specify 'usecols' or 'columns' when reading a DataFrame to load only necessary columns}}
df17 = pd.read_csv('data.csv', sep=',', header=0, usecols=['col1', 'col2'])

# The column list may come from a variable.
cols_to_use = ['col1', 'col2', 'col3']
df18 = pd.read_parquet('data.parquet', columns=cols_to_use)