Skip to content

Commit

Permalink
Rename findSymbols into lookupSymbolByName (#1772)
Browse files Browse the repository at this point in the history
* Rename `findSymbols` into `lookupSymbolByName`

This PR renames `findSymbols` into `lookupSymbolByName` as a more appropriate name, because it looks up a symbol by its name.

Fixes #1767

* Update cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/ScopeManager.kt

Co-authored-by: KuechA <31155350+KuechA@users.noreply.github.com>

* Added documentation

---------

Co-authored-by: KuechA <31155350+KuechA@users.noreply.github.com>
  • Loading branch information
oxisto and KuechA authored Oct 3, 2024
1 parent 3d07443 commit 51cb57e
Show file tree
Hide file tree
Showing 10 changed files with 159 additions and 22 deletions.
25 changes: 15 additions & 10 deletions cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/ScopeManager.kt
Original file line number Diff line number Diff line change
Expand Up @@ -899,7 +899,7 @@ class ScopeManager : ScopeProvider {
* @return the declaration, or null if it does not exist
*/
fun getRecordForName(name: Name): RecordDeclaration? {
return findSymbols(name).filterIsInstance<RecordDeclaration>().singleOrNull()
return lookupSymbolByName(name).filterIsInstance<RecordDeclaration>().singleOrNull()
}

fun typedefFor(alias: Name, scope: Scope? = currentScope): Type? {
Expand Down Expand Up @@ -960,16 +960,21 @@ class ScopeManager : ScopeProvider {
get() = currentScope

/**
* This function tries to resolve a [Node.name] to a list of symbols (a symbol represented by a
* [Declaration]) starting with [startScope]. This function can return a list of multiple
* symbols in order to check for things like function overloading. but it will only return list
* of symbols within the same scope; the list cannot be spread across different scopes.
* This function tries to convert a [Node.name] into a [Symbol] and then performs a lookup of
* this symbol. This can either be an "unqualified lookup" if [name] is not qualified or a
* "qualified lookup" if [Name.isQualified] is true. In the unqualified case the lookup starts
* in [startScope], in the qualified case we use [extractScope] to find the appropriate scope
* and need to restrict our search to this particular scope.
*
* This means that as soon one or more symbols are found in a "local" scope, these shadow all
* other occurrences of the same / symbol in a "higher" scope and only the ones from the lower
* ones will be returned.
* This function can return a list of multiple declarations in order to check for things like
* function overloading. But it will only return a list of declarations within the same scope;
* the list cannot be spread across different scopes.
*
* This means that as soon as one or more declarations for the symbol are found in a "local"
* scope, these shadow all other occurrences of the same symbol in a "higher" scope and only the
* ones from the lower scope will be returned.
*/
fun findSymbols(
fun lookupSymbolByName(
name: Name,
location: PhysicalLocation? = null,
startScope: Scope? = currentScope,
Expand Down Expand Up @@ -1112,7 +1117,7 @@ data class CallResolutionResult(

/**
* A set of candidate symbols we discovered based on the [CallExpression.callee] (using
* [ScopeManager.findSymbols]), more specifically a list of [FunctionDeclaration] nodes.
* [ScopeManager.lookupSymbolByName]), more specifically a list of [FunctionDeclaration] nodes.
*/
var candidateFunctions: Set<FunctionDeclaration>,

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ class ImportResolver(ctx: TranslationContext) : ComponentPass(ctx) {

// Let's do some importing. We need to import either a wildcard
if (node.wildcardImport) {
val list = scopeManager.findSymbols(node.import, node.location, scope)
val list = scopeManager.lookupSymbolByName(node.import, node.location, scope)
val symbol = list.singleOrNull()
if (symbol != null) {
// In this case, the symbol must point to a name scope
Expand All @@ -69,7 +69,8 @@ class ImportResolver(ctx: TranslationContext) : ComponentPass(ctx) {
}
} else {
// or a symbol directly
val list = scopeManager.findSymbols(node.import, node.location, scope).toMutableList()
val list =
scopeManager.lookupSymbolByName(node.import, node.location, scope).toMutableList()
node.importedSymbols = mutableMapOf(node.symbol to list)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ open class SymbolResolver(ctx: TranslationContext) : ComponentPass(ctx) {

// Find a list of candidate symbols. Currently, this is only used in the "next-gen" call
// resolution, but in future this will also be used in resolving regular references.
current.candidates = scopeManager.findSymbols(current.name, current.location).toSet()
current.candidates = scopeManager.lookupSymbolByName(current.name, current.location).toSet()

// Preparation for a future without legacy call resolving. Taking the first candidate is not
// ideal since we are running into an issue with function pointers here (see workaround
Expand Down Expand Up @@ -679,7 +679,7 @@ open class SymbolResolver(ctx: TranslationContext) : ComponentPass(ctx) {
var candidates = mutableSetOf<Declaration>()
val records = possibleContainingTypes.mapNotNull { it.root.recordDeclaration }.toSet()
for (record in records) {
candidates.addAll(ctx.scopeManager.findSymbols(record.name.fqn(symbol)))
candidates.addAll(ctx.scopeManager.lookupSymbolByName(record.name.fqn(symbol)))
}

// Find invokes by supertypes
Expand Down Expand Up @@ -845,7 +845,7 @@ open class SymbolResolver(ctx: TranslationContext) : ComponentPass(ctx) {
listOf()
} else {
val firstLevelCandidates =
possibleTypes.map { scopeManager.findSymbols(it.name.fqn(name)) }.flatten()
possibleTypes.map { scopeManager.lookupSymbolByName(it.name.fqn(name)) }.flatten()

// C++ does not allow overloading at different hierarchy levels. If we find a
// FunctionDeclaration with the same name as the function in the CallExpression we have
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ open class TypeResolver(ctx: TranslationContext) : ComponentPass(ctx) {
// constructor declarations and such with the same name. It seems this is ok since most
// languages will prefer structs/classes over functions when resolving types.
var symbols =
ctx?.scopeManager?.findSymbols(type.name, startScope = type.scope) {
ctx?.scopeManager?.lookupSymbolByName(type.name, startScope = type.scope) {
it is DeclaresType
} ?: listOf()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ internal class ScopeManagerTest : BaseTest() {
// resolve symbol
val call =
frontend.newCallExpression(frontend.newReference("A::func1"), "A::func1", false)
val func = final.findSymbols(call.callee!!.name).firstOrNull()
val func = final.lookupSymbolByName(call.callee!!.name).firstOrNull()

assertEquals(func1, func)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ class GoExtraPass(ctx: TranslationContext) : ComponentPass(ctx) {

// Try to see if we already know about this namespace somehow
val namespace =
scopeManager.findSymbols(import.name, null).filter {
scopeManager.lookupSymbolByName(import.name, null).filter {
it is NamespaceDeclaration && it.path == import.importURL
}

Expand Down
1 change: 1 addition & 0 deletions docs/docs/CPG/impl/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ the graph. These two stages are strictly separated one from each other.
* [Languages and Language Frontends](./language)
* [Scopes](./scopes)
* [Passes](./passes)
* [Symbol Resolution](./symbol-resolver.md)
97 changes: 94 additions & 3 deletions docs/docs/CPG/impl/scopes.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
title: "Implementation and Concepts - Scopes"
linkTitle: "Implementation and Concepts - Scopes"
title: "Implementation and Concepts - Scopes and Symbols"
linkTitle: "Implementation and Concepts - Scopes and Symbols"
weight: 20
no_list: false
menu:
Expand All @@ -11,5 +11,96 @@ description: >
---


# Implementation and Concepts: Scopes and Scope Manger
# Scopes and Symbols

The concepts of scopes and symbols are at the heart of every programming language and thus are also at the core of static analysis. Both concepts exist in the CPG library through the types `Scope` and `Symbol` respectively.

A "symbol" can be seen as an identifier in most programming languages, referring to variables or functions. Symbols are often grouped in scopes, which define the visibility of a symbol, i.e., the slice of a program that can "see" the symbol. Often this is also synonymous with the life-time of a variable, e.g., that its memory will be freed (or collected by a garbage collector) once it goes "out of scope".

```c
// This defines a symbol "a" in the global/file scope.
// Its visibility is global within the file.
int a = 1;

int main() {
// this defines another symbol "a" in a function/block scope.
// Its visibility is limited to the block it is defined in.
int a = 1;
}
```

Usually, symbols declared in a local scope override the declaration of a symbol in a higher (e.g., global) scope, which is also referred to as "shadowing". This needs to be taken into account when resolving symbols to their declarations.

The `Scope` class holds all its symbols in the `Scope::symbols` property. More specifically, this property is a `SymbolMap`, which is a type alias to a map, whose key type is a `Symbol` and whose value type is a list of `Declaration` nodes. This is basically a symbol lookup table for all symbols in its scope. It is a map of a list because some programming languages have concepts like function overloading, which leads to the declaration of multiple `FunctionDeclaration` nodes under the same symbol in one scope. In the current implementation, a `Symbol` is just a typealias for a string, and it is always "local" to the scope, meaning that it MUST NOT contain any qualifier. If you want to refer to a fully qualified identifier, a `Name` must be used. In the future, we might consider merging the concepts of `Symbol` and `Name`.

For a frontend or pass developer, the main interaction point with scopes and symbols is through the `ScopeManager`. The scope manager is available to all nodes via the `TranslationContext` and also injected in frontend, handlers and passes.

## Hierarchy of Scopes

Each scope (except the `GlobalScope`) can have a parent and possibly child scopes. This can be used to model a hierarchy of scopes within a program. For example, using the snippet above, the following scopes are defined in the CPG:

* A `GlobalScope` that comprises the whole file
* A `FunctionScope` that comprises the function `main`
* A `BlockScope` that comprises the function body

Note that each programming language is different when it comes to scoping and this needs to be thought of by a frontend developer. For example, in C/C++ each block introduced by `{}` introduces a new scope and variables can be declared only for such a block, meaning that each `for`, `if` and other statements also introduce a new scope. In contrast, Python only differentiates between global, function and class scopes.

## Defining Scopes and Declaring Symbols

In order to define new scopes, the `ScopeManager` offers two main APIs:

* `enterScope(node)`, which specifies that `node` will declare a new scope and that an appropriate `Scope` (or derived type) will be created
* `leaveScope(node)`, which closes the scope again

It is important that every opened scope must also be closed again. When scopes are nested, they also need to be closed in reverse order.

```Kotlin
// We are inside the global scope here and want to create a new function
var func = newFunctionDeclaration("main")

// Create a function scope
scopeManager.enterScope(func)

// Create a block scope for the body because our language works this way
var body = newBlock()
func.body = body
scopeManager.enterScope(body)

// Add statements here
body.statements += /* ... */

// Leave block scope
scopeManager.leaveScope(body)

// Back to global scope, add the function to global scope
scopeManager.leaveScope(func)
scopeManager.addDeclaration(func)
```

Inside the scope, declarations can be added with `ScopeManager::addDeclaration`. This takes care of adding the declaration to an appropriate place in the AST (which is beyond the scope of this document) and also adds the `Declaration` to the `Scope` under the appropriate `Symbol`.


## Looking up Symbols

During different analysis steps, e.g., in different passes, we want to find certain symbols or look up the declaration(s) belonging to a particular symbol. There are two functions to do so - a "higher" level function in the `ScopeManager` and a "lower" level function on the `Scope` itself.

The lower level one is called `Scope::lookupSymbol` and can be used to retrieve a list of `Declaration` nodes that belong to a particular `Symbol` that is "visible" in the scope. It does so by first looking through its own `Scope::symbols`. If no match was found, the scope is traversed upwards to its `Scope::parent`, until a match is found. Furthermore, additional logic is needed to resolve symbols that point to another scope, e.g., because they represent an `ImportDeclaration`.

```Kotlin
var scope = /* ... */
var declarations = scope.lookupSymbol("a") {
// Some additional predicate if we want
}
```

Additionally, the lookup can be fine-tuned by an additional predicate. However, this should be used carefully as it restricts the possible list of symbols very early. In most cases the list of symbols should be quite exhaustive at first to find all possible candidates, and the best candidate should then be selected in a second step (e.g., based on argument types for a function call).

While the aforementioned API works great if we already have a specific start scope and local `Symbol`, we often start our resolution process with a `Name` -- which could potentially be qualified, such as `std::string`. Therefore, the "higher level" function `ScopeManager::lookupSymbolByName` can be used to retrieve a list of candidate declarations by a given `Name`. In a first step, the name is checked for a potential scope qualifier (`std` in this example). If present, it is extracted and the search scope is set to it. This is what is usually referred to as a "qualified lookup". Otherwise, the local part of the name is used to start the lookup, in what is called an "unqualified lookup". In both cases, the actual lookup is delegated to `Scope::lookupSymbol`, but with different parameters.

```Kotlin
var name = parseName("std::string")
// This will return all the 'string' symbols within the 'std' name scope
var stringSymbols = scopeManager.lookupSymbolByName(name)
```

Developers should avoid symbol lookup during frontend parsing, since often during parsing, only a limited view of all symbols is available. Instead, a dedicated pass that is run on the complete translation result is the preferred option. Apart from that, the main usage of this API is in the [SymbolResolver](symbol-resolver.md).
38 changes: 38 additions & 0 deletions docs/docs/CPG/impl/symbol-resolver.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
---
title: "Implementation and Concepts - Symbol Resolution"
linkTitle: "Implementation and Concepts - Symbol Resolution"
weight: 20
no_list: false
menu:
main:
weight: 20
description: >
The CPG library is a language-agnostic graph representation of source code.
---


# Symbol Resolution

This page describes the main functionality behind symbol resolution in the CPG library. This is mostly done by the `SymbolResolver` pass, in combination with the symbol lookup API (see [Scopes and Symbols](scopes.md#looking-up-symbols)). In addition to the *lookup* of a symbol, the *resolution* takes the input of the lookup and provides a "definite" decision which symbol is used. This mostly refers to symbols / names used in a `Reference` or a `CallExpression` (which also has a reference as its `CallExpression::callee`).

## The `SymbolResolver` Pass

The `SymbolResolver` pass takes care of the heavy lifting of symbol (or rather reference) resolving:

* It sets the `Reference::refersTo` property,
* and sets the `CallExpression::invokes` property,
* and finally takes care of operator overloading (if the language supports it).

In a way, it can be compared to a linker step in a compiler. The pass operates on a single `Component` and starts by identifying EOG starter nodes within the component. These nodes "start" an EOG sub-graph, i.e., they do not have any previous EOG edges. The symbol resolver uses the `ScopedWalker` with a special set-up that traverses the EOG starting with each EOG starter node until it reaches the end. This ensures that symbols are resolved in the correct order of "evaluation", e.g., that a base of a member expression is resolved before the expression itself. This ensures that necessary type information on the base is available in order to resolve appropriate fields of the member expression.

The symbol resolver itself has gone through many re-writes over the years and there is still some code left that we consider *legacy*. These functions are marked as such, and we aim to remove them slowly.

## Resolving References

The main functionality lies in `SymbolResolver::handleReference`. For all `Reference` nodes (that are not `MemberExpression` nodes) we use the symbol lookup API to find declaration candidates for the name the reference is referring to. This candidate list is then stored in `Reference::candidates`. If the reference is the `CallExpression::callee` property of a call, we abort here and jump to [Resolve Calls](#resolve-calls).

Otherwise, we currently take the first entry of the candidate list and set the `Reference::refersTo` property to it.

## Resolve Calls

Prerequisite: The `CallExpression::callee` reference must have been resolved (see [Resolving References](#resolving-references)).
3 changes: 2 additions & 1 deletion docs/mkdocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,9 @@ nav:
- "Implementation":
- CPG/impl/index.md
- "Language Frontends": CPG/impl/language.md
- "Scopes": CPG/impl/scopes.md
- "Scopes and Symbols": CPG/impl/scopes.md
- "Passes": CPG/impl/passes.md
- "Symbol Resolution": CPG/impl/symbol-resolver.md
- "Contributing":
- "Contributing to the CPG library": Contributing/index.md
# This assumes that the most recent dokka build was generated with the "main" tag!
Expand Down

0 comments on commit 51cb57e

Please sign in to comment.