Skip to content

Commit ea74317

Browse files
authored
Merge pull request #8781 from yoff/python-dataflow/flow-summaries-from-scratch
Python dataflow: flow summaries restart
2 parents 805aa94 + 318e329 commit ea74317

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+2694
-167
lines changed

config/identical-files.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,11 @@
7373
"ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowImplConsistency.qll",
7474
"swift/ql/lib/codeql/swift/dataflow/internal/DataFlowImplConsistency.qll"
7575
],
76-
"DataFlow Java/C# Flow Summaries": [
76+
"DataFlow Java/C#/Ruby/Python/Swift Flow Summaries": [
7777
"java/ql/lib/semmle/code/java/dataflow/internal/FlowSummaryImpl.qll",
7878
"csharp/ql/lib/semmle/code/csharp/dataflow/internal/FlowSummaryImpl.qll",
7979
"ruby/ql/lib/codeql/ruby/dataflow/internal/FlowSummaryImpl.qll",
80+
"python/ql/lib/semmle/python/dataflow/new/internal/FlowSummaryImpl.qll",
8081
"swift/ql/lib/codeql/swift/dataflow/internal/FlowSummaryImpl.qll"
8182
],
8283
"SsaReadPosition Java/C#": [
@@ -532,7 +533,7 @@
532533
"java/ql/lib/semmle/code/java/dataflow/internal/AccessPathSyntax.qll",
533534
"javascript/ql/lib/semmle/javascript/frameworks/data/internal/AccessPathSyntax.qll",
534535
"ruby/ql/lib/codeql/ruby/dataflow/internal/AccessPathSyntax.qll",
535-
"python/ql/lib/semmle/python/frameworks/data/internal/AccessPathSyntax.qll",
536+
"python/ql/lib/semmle/python/dataflow/new/internal/AccessPathSyntax.qll",
536537
"swift/ql/lib/codeql/swift/dataflow/internal/AccessPathSyntax.qll"
537538
],
538539
"IncompleteUrlSubstringSanitization": [

python/ql/lib/design.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# The Python libraries
2+
3+
The Python libraries are a collection of libraries for analysing Python code.
4+
Everything can be imported by importing `python.qll`.
5+
6+
## The analysis layers
7+
8+
The analysis is built up in layers. The stack looks like this:
9+
10+
- AST (comes from the extractor)
11+
- Control flow graph (CFG) (built by the extractor)
12+
- SSA
13+
- Call graph
14+
- Data flow
15+
16+
## Avoiding non-monotonic recursion
17+
18+
Given the many interacting layers, it is important to decide which predicates are allowed to be mutually recursive in order to avoid non-monotonic recursion when negation is used to express the predicates.
19+
As an example, we have defined local sources as those which do not receive local flow. This means that the local flow relation is not allowed to be recursive with anything depending on local sources.
20+
21+
Some particular restrictions to keep in mind:
22+
23+
- Typetracking needs to use a local flow step not including summaries
24+
- Typetracking needs to use a call graph not including summaries

python/ql/lib/semmle/python/ApiGraphs.qll

Lines changed: 56 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,21 @@ module API {
380380
not m.matches("%.%")
381381
}
382382

383+
/**
384+
* Holds if an import of module `m` exists.
385+
*
386+
* This is determined without referring to `Node`,
387+
* allowing this predicate to be used in a negative
388+
* context when constructing new nodes.
389+
*/
390+
predicate moduleImportExists(string m) {
391+
Impl::isImported(m) and
392+
// restrict `moduleImport` so it will never give results for a dotted name. Note
393+
// that we cannot move this logic to the `MkModuleImport` construction, since we
394+
// need the intermediate API graph nodes for the prefixes in `import foo.bar.baz`.
395+
not m.matches("%.%")
396+
}
397+
383398
/** Gets a node corresponding to the built-in with the given name, if any. */
384399
Node builtin(string n) { result = moduleImport("builtins").getMember(n) }
385400

@@ -605,14 +620,38 @@ module API {
605620
*
606621
* Ignores relative imports, such as `from ..foo.bar import baz`.
607622
*/
608-
private predicate imports(DataFlow::Node imp, string name) {
623+
private predicate imports(DataFlow::CfgNode imp, string name) {
609624
exists(PY::ImportExprNode iexpr |
610-
imp.asCfgNode() = iexpr and
625+
imp.getNode() = iexpr and
611626
not iexpr.getNode().isRelative() and
612627
name = iexpr.getNode().getImportedModuleName()
613628
)
614629
}
615630

631+
/**
632+
* Holds if the module `name` is imported.
633+
*
634+
* This is determined syntactically.
635+
*/
636+
cached
637+
predicate isImported(string name) {
638+
// Ignore the following module name for Python 2, as we alias `__builtin__` to `builtins` elsewhere
639+
(name != "__builtin__" or PY::major_version() = 3) and
640+
(
641+
exists(PY::ImportExpr iexpr |
642+
not iexpr.isRelative() and
643+
name = iexpr.getImportedModuleName()
644+
)
645+
or
646+
// When we `import foo.bar.baz` we want to create API graph nodes also for the prefixes
647+
// `foo` and `foo.bar`:
648+
name = any(PY::ImportExpr e | not e.isRelative()).getAnImportedModuleName()
649+
)
650+
or
651+
// The `builtins` module should always be implicitly available
652+
name = "builtins"
653+
}
654+
616655
private import semmle.python.dataflow.new.internal.Builtins
617656
private import semmle.python.dataflow.new.internal.ImportStar
618657

@@ -631,7 +670,7 @@ module API {
631670
*/
632671
private TApiNode potential_import_star_base(PY::Scope s) {
633672
exists(DataFlow::Node n |
634-
n.asCfgNode() = ImportStar::potentialImportStarBase(s) and
673+
n.(DataFlow::CfgNode).getNode() = ImportStar::potentialImportStarBase(s) and
635674
use(result, n)
636675
)
637676
}
@@ -653,17 +692,17 @@ module API {
653692
or
654693
// TODO: I had expected `DataFlow::AttrWrite` to contain the attribute writes from a dict, that's how JS works.
655694
exists(PY::Dict dict, PY::KeyValuePair item |
656-
dict = pred.asExpr() and
695+
dict = pred.(DataFlow::ExprNode).getNode().getNode() and
657696
dict.getItem(_) = item and
658697
lbl = Label::member(item.getKey().(PY::StrConst).getS()) and
659-
rhs.asExpr() = item.getValue()
698+
rhs.(DataFlow::ExprNode).getNode().getNode() = item.getValue()
660699
)
661700
or
662-
exists(PY::CallableExpr fn | fn = pred.asExpr() |
701+
exists(PY::CallableExpr fn | fn = pred.(DataFlow::ExprNode).getNode().getNode() |
663702
not fn.getInnerScope().isAsync() and
664703
lbl = Label::return() and
665704
exists(PY::Return ret |
666-
rhs.asExpr() = ret.getValue() and
705+
rhs.(DataFlow::ExprNode).getNode().getNode() = ret.getValue() and
667706
ret.getScope() = fn.getInnerScope()
668707
)
669708
)
@@ -716,9 +755,9 @@ module API {
716755
// "benign" and let subclasses edges flow through anyway.
717756
// see example in https://github.com/django/django/blob/c2250cfb80e27cdf8d098428824da2800a18cadf/tests/auth_tests/test_views.py#L40-L46
718757
(
719-
ref.asExpr() = clsExpr
758+
ref.(DataFlow::ExprNode).getNode().getNode() = clsExpr
720759
or
721-
ref.asExpr() = clsExpr.getADecoratorCall()
760+
ref.(DataFlow::ExprNode).getNode().getNode() = clsExpr.getADecoratorCall()
722761
)
723762
)
724763
or
@@ -731,26 +770,27 @@ module API {
731770
)
732771
or
733772
exists(DataFlow::Node def, PY::CallableExpr fn |
734-
rhs(base, def) and fn = trackDefNode(def).asExpr()
773+
rhs(base, def) and fn = trackDefNode(def).(DataFlow::ExprNode).getNode().getNode()
735774
|
736775
exists(int i, int offset |
737776
if exists(PY::Parameter p | p = fn.getInnerScope().getAnArg() and p.isSelf())
738777
then offset = 1
739778
else offset = 0
740779
|
741780
lbl = Label::parameter(i - offset) and
742-
ref.asExpr() = fn.getInnerScope().getArg(i)
781+
ref.(DataFlow::ExprNode).getNode().getNode() = fn.getInnerScope().getArg(i)
743782
)
744783
or
745784
exists(string name, PY::Parameter param |
746785
lbl = Label::keywordParameter(name) and
747786
param = fn.getInnerScope().getArgByName(name) and
748787
not param.isSelf() and
749-
ref.asExpr() = param
788+
ref.(DataFlow::ExprNode).getNode().getNode() = param
750789
)
751790
or
752791
lbl = Label::selfParameter() and
753-
ref.asExpr() = any(PY::Parameter p | p = fn.getInnerScope().getAnArg() and p.isSelf())
792+
ref.(DataFlow::ExprNode).getNode().getNode() =
793+
any(PY::Parameter p | p = fn.getInnerScope().getAnArg() and p.isSelf())
754794
)
755795
or
756796
// Built-ins, treated as members of the module `builtins`
@@ -762,7 +802,7 @@ module API {
762802
base = potential_import_star_base(s) and
763803
lbl =
764804
Label::member(any(string name |
765-
ImportStar::namePossiblyDefinedInImportStar(ref.asCfgNode(), name, s)
805+
ImportStar::namePossiblyDefinedInImportStar(ref.(DataFlow::CfgNode).getNode(), name, s)
766806
))
767807
)
768808
or
@@ -854,7 +894,7 @@ module API {
854894
DataFlow::LocalSourceNode trackUseNode(DataFlow::LocalSourceNode src) {
855895
Stages::TypeTracking::ref() and
856896
result = trackUseNode(src, DataFlow::TypeTracker::end()) and
857-
not result instanceof DataFlow::ModuleVariableNode
897+
result instanceof DataFlow::ExprNode
858898
}
859899

860900
/**
@@ -1044,7 +1084,7 @@ module API {
10441084
ApiLabel memberFromRef(DataFlow::AttrRef ref) {
10451085
result = member(ref.getAttributeName())
10461086
or
1047-
not exists(ref.getAttributeName()) and
1087+
ref.unknownAttribute() and
10481088
result = unknownMember()
10491089
}
10501090

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
/** Provides classes and predicates for defining flow summaries. */
2+
3+
private import python
4+
private import semmle.python.dataflow.new.DataFlow
5+
private import semmle.python.frameworks.data.ModelsAsData
6+
private import semmle.python.ApiGraphs
7+
private import internal.FlowSummaryImpl as Impl
8+
private import internal.DataFlowUtil
9+
private import internal.DataFlowPrivate
10+
11+
// import all instances below
12+
private module Summaries {
13+
private import semmle.python.Frameworks
14+
}
15+
16+
class SummaryComponent = Impl::Public::SummaryComponent;
17+
18+
/** Provides predicates for constructing summary components. */
19+
module SummaryComponent {
20+
private import Impl::Public::SummaryComponent as SC
21+
22+
predicate parameter = SC::parameter/1;
23+
24+
predicate argument = SC::argument/1;
25+
26+
predicate content = SC::content/1;
27+
28+
/** Gets a summary component that represents a list element. */
29+
SummaryComponent listElement() { result = content(any(ListElementContent c)) }
30+
31+
/** Gets a summary component that represents the return value of a call. */
32+
SummaryComponent return() { result = SC::return(any(ReturnKind rk)) }
33+
}
34+
35+
class SummaryComponentStack = Impl::Public::SummaryComponentStack;
36+
37+
/** Provides predicates for constructing stacks of summary components. */
38+
module SummaryComponentStack {
39+
private import Impl::Public::SummaryComponentStack as SCS
40+
41+
predicate singleton = SCS::singleton/1;
42+
43+
predicate push = SCS::push/2;
44+
45+
predicate argument = SCS::argument/1;
46+
47+
/** Gets a singleton stack representing the return value of a call. */
48+
SummaryComponentStack return() { result = singleton(SummaryComponent::return()) }
49+
}
50+
51+
/** A callable with a flow summary, identified by a unique string. */
52+
abstract class SummarizedCallable extends LibraryCallable, Impl::Public::SummarizedCallable {
53+
bindingset[this]
54+
SummarizedCallable() { any() }
55+
56+
/**
57+
* Same as
58+
*
59+
* ```ql
60+
* propagatesFlow(
61+
* SummaryComponentStack input, SummaryComponentStack output, boolean preservesValue
62+
* )
63+
* ```
64+
*
65+
* but uses an external (string) representation of the input and output stacks.
66+
*/
67+
pragma[nomagic]
68+
predicate propagatesFlowExt(string input, string output, boolean preservesValue) { none() }
69+
}
70+
71+
class RequiredSummaryComponentStack = Impl::Public::RequiredSummaryComponentStack;
72+
// // This gives access to getNodeFromPath, which is not constrained to `CallNode`s
73+
// // as `resolvedSummaryBase` is.
74+
// private import semmle.python.frameworks.data.internal.ApiGraphModels as AGM
75+
//
76+
// private class SummarizedCallableFromModel extends SummarizedCallable {
77+
// string package;
78+
// string type;
79+
// string path;
80+
// SummarizedCallableFromModel() {
81+
// ModelOutput::relevantSummaryModel(package, type, path, _, _, _) and
82+
// this = package + ";" + type + ";" + path
83+
// }
84+
// override CallCfgNode getACall() {
85+
// exists(API::CallNode base |
86+
// ModelOutput::resolvedSummaryBase(package, type, path, base) and
87+
// result = base.getACall()
88+
// )
89+
// }
90+
// override ArgumentNode getACallback() {
91+
// exists(API::Node base |
92+
// base = AGM::getNodeFromPath(package, type, path) and
93+
// result = base.getAValueReachableFromSource()
94+
// )
95+
// }
96+
// override predicate propagatesFlowExt(string input, string output, boolean preservesValue) {
97+
// exists(string kind |
98+
// ModelOutput::relevantSummaryModel(package, type, path, input, output, kind)
99+
// |
100+
// kind = "value" and
101+
// preservesValue = true
102+
// or
103+
// kind = "taint" and
104+
// preservesValue = false
105+
// )
106+
// }
107+
// }

python/ql/lib/semmle/python/dataflow/new/internal/Attributes.qll

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ abstract class AttrRef extends Node {
4040
or
4141
exists(LocalSourceNode nodeFrom |
4242
nodeFrom.flowsTo(this.getAttributeNameExpr()) and
43-
attrName = nodeFrom.asExpr().(StrConst).getText()
43+
attrName = nodeFrom.(CfgNode).getNode().getNode().(StrConst).getText()
4444
)
4545
}
4646

@@ -50,6 +50,9 @@ abstract class AttrRef extends Node {
5050
* better results.
5151
*/
5252
abstract string getAttributeName();
53+
54+
/** Holds if a name could not be determined for this attribute. */
55+
predicate unknownAttribute() { not exists(this.getAttributeName()) }
5356
}
5457

5558
/**
@@ -175,7 +178,7 @@ private class SetAttrCallAsAttrWrite extends AttrWrite, CfgNode {
175178
override ExprNode getAttributeNameExpr() { result.asCfgNode() = node.getName() }
176179

177180
override string getAttributeName() {
178-
result = this.getAttributeNameExpr().asExpr().(StrConst).getText()
181+
result = this.getAttributeNameExpr().(CfgNode).getNode().getNode().(StrConst).getText()
179182
}
180183
}
181184

@@ -251,7 +254,7 @@ private class GetAttrCallAsAttrRead extends AttrRead, CfgNode {
251254
override ExprNode getAttributeNameExpr() { result.asCfgNode() = node.getName() }
252255

253256
override string getAttributeName() {
254-
result = this.getAttributeNameExpr().asExpr().(StrConst).getText()
257+
result = this.getAttributeNameExpr().(CfgNode).getNode().getNode().(StrConst).getText()
255258
}
256259
}
257260

python/ql/lib/semmle/python/dataflow/new/internal/Builtins.qll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,9 @@ module Builtins {
6060
* Currently this is an over-approximation, and may not account for things like overwriting a
6161
* built-in with a different value.
6262
*/
63-
DataFlow::Node likelyBuiltin(string name) {
63+
DataFlow::CfgNode likelyBuiltin(string name) {
6464
exists(Module m |
65-
result.asCfgNode() =
65+
result.getNode() =
6666
any(NameNode n |
6767
possible_builtin_accessed_in_module(n, name, m) and
6868
not possible_builtin_defined_in_module(name, m)

0 commit comments

Comments
 (0)