Skip to content

API Reference

Basic Matching

fuzzybunny.levenshtein(s1, s2)

Source code in src/fuzzybunny/_fuzzybunny.pyi
def levenshtein(s1: str, s2: str) -> float:
    """Standard edit-distance (Levenshtein) similarity ratio between s1 and s2."""
    ...

fuzzybunny.partial_ratio(s1, s2)

Source code in src/fuzzybunny/_fuzzybunny.pyi
def partial_ratio(s1: str, s2: str) -> float:
    """Best partial (substring) match ratio between s1 and s2."""
    ...

fuzzybunny.jaccard(s1, s2)

Source code in src/fuzzybunny/_fuzzybunny.pyi
def jaccard(s1: str, s2: str) -> float:
    """Jaccard similarity between the token sets of s1 and s2."""
    ...

fuzzybunny.token_sort(s1, s2)

Source code in src/fuzzybunny/_fuzzybunny.pyi
def token_sort(s1: str, s2: str) -> float:
    """Similarity after sorting each string's tokens before comparison."""
    ...

fuzzybunny.token_set(s1, s2)

Source code in src/fuzzybunny/_fuzzybunny.pyi
def token_set(s1: str, s2: str) -> float:
    """Set-based token comparison (handles duplicate tokens and word order)."""
    ...

fuzzybunny.qratio(s1, s2)

Source code in src/fuzzybunny/_fuzzybunny.pyi
def qratio(s1: str, s2: str) -> float:
    """Simplified (quick) Levenshtein ratio between s1 and s2."""
    ...

fuzzybunny.wratio(s1, s2)

Source code in src/fuzzybunny/_fuzzybunny.pyi
def wratio(s1: str, s2: str) -> float:
    """Weighted combination of multiple similarity algorithms."""
    ...

Ranking

fuzzybunny.rank(query, candidates, scorer='levenshtein', mode='full', process=True, threshold=0.0, top_n=-1, weights=None)

Ranks a list of candidates based on their similarity to a query string.

This is the primary function for finding the best matches in a collection. It supports multiple scoring algorithms, threshold filtering, and integrated string normalization.

Parameters:

Name Type Description Default
query str

The string to search for.

required
candidates CandidatesType

A collection of strings to search through. Can be a list, pandas.Series, or numpy.ndarray.

required
scorer Union[str, Callable[[str, str], float]]

The similarity algorithm to use. Options include: - "levenshtein": Standard edit distance ratio. - "wratio": Weighted combination of multiple algorithms (recommended). - "qratio": Simplified Levenshtein ratio. - "token_sort": Sorts tokens before comparison. - "token_set": Set-based comparison (handles duplicates and order). - "jaccard": Jaccard similarity between token sets. - Or a custom Callable[[str, str], float].

'levenshtein'
mode str

Matching mode. - "full": Matches the entire candidate string. - "partial": Finds the best substring match.

'full'
process bool

If True, applies normalization (lowercasing, punctuation removal) before matching.

True
threshold float

Minimum score (0.0 to 1.0) for a candidate to be included in the results.

0.0
top_n int

Maximum number of results to return. Use -1 for all matches.

-1
weights Dict[str, float]

Dictionary of weights for the "hybrid" scorer.

None

Returns:

Type Description
List[Tuple[str, float]]

A list of tuples containing (matched_string, similarity_score),
sorted by score in descending order.

Examples:

>>> import fuzzybunny
>>> fuzzybunny.rank("apple", ["apple pie", "banana", "apricot"])
[('apple pie', 0.5555555555555556), ('apricot', 0.42857142857142855)]
>>> # Partial matching
>>> fuzzybunny.rank("apple", ["apple pie"], mode="partial")
[('apple pie', 1.0)]
Source code in src/fuzzybunny/__init__.py
def rank(
    query: str, 
    candidates: CandidatesType, 
    scorer: Union[str, Callable[[str, str], float]] = "levenshtein", 
    mode: str = "full", 
    process: bool = True, 
    threshold: float = 0.0, 
    top_n: int = -1, 
    weights: Union[Dict[str, float], None] = None
) -> List[Tuple[str, float]]:
    """
    Ranks a list of candidates based on their similarity to a query string.

    This is the primary function for finding the best matches in a collection. It supports
    multiple scoring algorithms, threshold filtering, and integrated string normalization.

    Args:
        query: The string to search for.
        candidates: A collection of strings to search through. Can be a list, 
            pandas.Series, or numpy.ndarray.
        scorer: The similarity algorithm to use. Options include:
            - `"levenshtein"`: Standard edit distance ratio.
            - `"wratio"`: Weighted combination of multiple algorithms (recommended).
            - `"qratio"`: Simplified Levenshtein ratio.
            - `"token_sort"`: Sorts tokens before comparison.
            - `"token_set"`: Set-based comparison (handles duplicates and order).
            - `"jaccard"`: Jaccard similarity between token sets.
            - Or a custom `Callable[[str, str], float]`.
        mode: Matching mode. 
            - `"full"`: Matches the entire candidate string.
            - `"partial"`: Finds the best substring match.
        process: If True, applies normalization (lowercasing, punctuation removal) 
            before matching.
        threshold: Minimum score (0.0 to 1.0) for a candidate to be included in 
            the results.
        top_n: Maximum number of results to return. Use -1 for all matches.
        weights: Dictionary of weights for the `"hybrid"` scorer, or None for
            the defaults (an empty mapping).

    Returns:
        A list of tuples containing (matched_string, similarity_score), 
        sorted by score in descending order.

    Examples:
        >>> import fuzzybunny
        >>> fuzzybunny.rank("apple", ["apple pie", "banana", "apricot"])
        [('apple pie', 0.5555555555555556), ('apricot', 0.42857142857142855)]

        >>> # Partial matching
        >>> fuzzybunny.rank("apple", ["apple pie"], mode="partial")
        [('apple pie', 1.0)]
    """
    # None is the sentinel for "no weights"; never use a mutable default dict.
    if weights is None:
        weights = {}

    # Normalize the candidate container to a plain list[str] for the C extension.
    if _is_pandas_series(candidates):
        candidates = candidates.astype(str).tolist()
    elif _is_numpy_array(candidates):
        # An ndarray already supports astype/tolist; re-wrapping it in
        # np.array() (as before) was a redundant copy and import.
        candidates = candidates.astype(str).tolist()
    elif not isinstance(candidates, (list, tuple)):
        candidates = list(candidates)

    return _fuzzybunny.rank(query, candidates, scorer, mode, process, threshold, top_n, weights)

fuzzybunny.batch_match(queries, candidates, scorer='levenshtein', mode='full', process=True, threshold=0.0, top_n=-1, weights=None)

Efficiently matches multiple queries against a collection of candidates.

Utilizes multi-threading (OpenMP) and internal string normalization caching to provide high-performance batch processing.

Parameters:

Name Type Description Default
queries QueriesType

A collection of strings to match.

required
candidates CandidatesType

A collection of target strings to search through.

required
scorer Union[str, Callable[[str, str], float]]

See rank for available options.

'levenshtein'
mode str

See rank.

'full'
process bool

See rank.

True
threshold float

See rank.

0.0
top_n int

Maximum number of results per query.

-1
weights Dict[str, float]

See rank.

None

Returns:

Type Description
List[List[Tuple[str, float]]]

A list of result lists, where each inner list corresponds to a query.

Note

This function is significantly faster than calling rank in a loop for large datasets due to parallelization and reduced overhead.

Source code in src/fuzzybunny/__init__.py
def batch_match(
    queries: QueriesType, 
    candidates: CandidatesType, 
    scorer: Union[str, Callable[[str, str], float]] = "levenshtein", 
    mode: str = "full", 
    process: bool = True, 
    threshold: float = 0.0, 
    top_n: int = -1, 
    weights: Union[Dict[str, float], None] = None
) -> List[List[Tuple[str, float]]]:
    """
    Efficiently matches multiple queries against a collection of candidates.

    Utilizes multi-threading (OpenMP) and internal string normalization caching
    to provide high-performance batch processing.

    Args:
        queries: A collection of strings to match.
        candidates: A collection of target strings to search through.
        scorer: See `rank` for available options.
        mode: See `rank`.
        process: See `rank`.
        threshold: See `rank`.
        top_n: Maximum number of results per query.
        weights: See `rank`. None means the defaults (an empty mapping).

    Returns:
        A list of result lists, where each inner list corresponds to a query.

    Note:
        This function is significantly faster than calling `rank` in a loop
        for large datasets due to parallelization and reduced overhead.
    """
    # None is the sentinel for "no weights"; never use a mutable default dict.
    if weights is None:
        weights = {}

    # Normalize the candidate container to a plain list[str] for the C extension.
    if _is_pandas_series(candidates):
        candidates = candidates.astype(str).tolist()
    elif _is_numpy_array(candidates):
        # An ndarray already supports astype/tolist; the previous np.array()
        # re-wrap was a redundant copy and import.
        candidates = candidates.astype(str).tolist()
    elif not isinstance(candidates, (list, tuple)):
        candidates = list(candidates)

    # Queries can also be pandas/numpy; both Series and ndarray expose
    # astype(str).tolist() directly, so no numpy round-trip is needed.
    if _is_pandas_series(queries) or _is_numpy_array(queries):
        queries = queries.astype(str).tolist()
    elif not isinstance(queries, (list, tuple)):
        queries = list(queries)

    return _fuzzybunny.batch_match(queries, candidates, scorer, mode, process, threshold, top_n, weights)

Utilities

fuzzybunny.benchmark.benchmark(query, candidates, scorers=None, n_runs=5)

Benchmark different scorers on a given query and set of candidates. Returns a dictionary with timing results.

Source code in src/fuzzybunny/benchmark.py
def benchmark(query, candidates, scorers=None, n_runs=5):
    """
    Benchmark different scorers on a given query and set of candidates.
    Returns a dictionary with timing results.
    """
    from . import rank

    # Default scorer selection when the caller supplies none.
    if scorers is None:
        scorers = ["levenshtein", "jaccard", "token_sort"]

    timings = {}
    for name in scorers:
        samples = []
        for _ in range(n_runs):
            t0 = time.perf_counter()
            rank(query, candidates, scorer=name)
            samples.append(time.perf_counter() - t0)

        # stdev needs at least two samples; report 0 for a single run.
        timings[name] = {
            "mean": statistics.mean(samples),
            "stddev": statistics.stdev(samples) if len(samples) > 1 else 0,
            "min": min(samples),
            "max": max(samples)
        }

    return timings

fuzzybunny.benchmark.benchmark_batch(queries, candidates, scorer='levenshtein', n_runs=3)

Benchmark batch_match performance.

Source code in src/fuzzybunny/benchmark.py
def benchmark_batch(queries, candidates, scorer="levenshtein", n_runs=3):
    """
    Benchmark batch_match performance.

    Args:
        queries: Collection of query strings forwarded to `batch_match`.
        candidates: Collection of candidate strings.
        scorer: Scorer name forwarded to `batch_match`.
        n_runs: Number of timed repetitions.

    Returns:
        Dict with the mean runtime, input sizes, and throughput
        (queries per second, based on the mean runtime).
    """
    from . import batch_match

    times = []
    for _ in range(n_runs):
        start = time.perf_counter()
        batch_match(queries, candidates, scorer=scorer)
        times.append(time.perf_counter() - start)

    # Compute the mean once and reuse it; the original evaluated
    # statistics.mean(times) twice (for "mean" and "queries_per_second").
    mean_time = statistics.mean(times)
    return {
        "mean": mean_time,
        "total_queries": len(queries),
        "total_candidates": len(candidates),
        "queries_per_second": len(queries) / mean_time
    }