Skip to content

API Reference

Basic Matching

fuzzybunny.levenshtein(s1, s2)

Source code in src/fuzzybunny/_fuzzybunny.pyi
def levenshtein(s1: str, s2: str) -> float:
    """Standard edit-distance (Levenshtein) similarity ratio between s1 and s2."""
    ...

fuzzybunny.partial_ratio(s1, s2)

Source code in src/fuzzybunny/_fuzzybunny.pyi
def partial_ratio(s1: str, s2: str) -> float:
    """Best partial (substring) match ratio between s1 and s2."""
    ...

fuzzybunny.jaccard(s1, s2)

Source code in src/fuzzybunny/_fuzzybunny.pyi
def jaccard(s1: str, s2: str) -> float:
    """Jaccard similarity between the token sets of s1 and s2."""
    ...

fuzzybunny.token_sort(s1, s2)

Source code in src/fuzzybunny/_fuzzybunny.pyi
def token_sort(s1: str, s2: str) -> float:
    """Similarity after sorting each string's tokens before comparison."""
    ...

fuzzybunny.token_set(s1, s2)

Source code in src/fuzzybunny/_fuzzybunny.pyi
def token_set(s1: str, s2: str) -> float:
    """Set-based token comparison (handles duplicate tokens and word order)."""
    ...

fuzzybunny.qratio(s1, s2)

Source code in src/fuzzybunny/_fuzzybunny.pyi
def qratio(s1: str, s2: str) -> float:
    """Simplified (quick) Levenshtein ratio between s1 and s2."""
    ...

fuzzybunny.wratio(s1, s2)

Source code in src/fuzzybunny/_fuzzybunny.pyi
def wratio(s1: str, s2: str) -> float:
    """Weighted combination of multiple similarity algorithms."""
    ...

Ranking

fuzzybunny.rank(query, candidates, scorer='levenshtein', mode='full', process=True, threshold=0.0, top_n=-1, weights=None)

Ranks a list of candidates based on their similarity to a query string.

This is the primary function for finding the best matches in a collection. It supports multiple scoring algorithms, threshold filtering, and integrated string normalization.

Parameters:

Name Type Description Default
query str

The string to search for.

required
candidates CandidatesType

A collection of strings to search through. Can be a list, pandas.Series, or numpy.ndarray.

required
scorer Union[str, Callable[[str, str], float]]

The similarity algorithm to use. Options include: - "levenshtein": Standard edit distance ratio. - "wratio": Weighted combination of multiple algorithms (recommended). - "qratio": Simplified Levenshtein ratio. - "token_sort": Sorts tokens before comparison. - "token_set": Set-based comparison (handles duplicates and order). - "jaccard": Jaccard similarity between token sets. - Or a custom Callable[[str, str], float].

'levenshtein'
mode str

Matching mode. - "full": Matches the entire candidate string. - "partial": Finds the best substring match.

'full'
process bool

If True, applies normalization (lowercasing, punctuation removal) before matching.

True
threshold float

Minimum score (0.0 to 1.0) for a candidate to be included in the results.

0.0
top_n int

Maximum number of results to return. Use -1 for all matches.

-1
weights Dict[str, float]

Dictionary of weights for the "hybrid" scorer.

None

Returns:

Type Description
List[Tuple[str, float]]

A list of tuples containing (matched_string, similarity_score),
sorted by score in descending order.

Examples:

>>> import fuzzybunny
>>> fuzzybunny.rank("apple", ["apple pie", "banana", "apricot"])
[('apple pie', 0.5555555555555556), ('apricot', 0.42857142857142855)]
>>> # Partial matching
>>> fuzzybunny.rank("apple", ["apple pie"], mode="partial")
[('apple pie', 1.0)]
Source code in src/fuzzybunny/__init__.py
def rank(
    query: str, 
    candidates: CandidatesType, 
    scorer: Union[str, Callable[[str, str], float]] = "levenshtein", 
    mode: str = "full", 
    process: bool = True, 
    threshold: float = 0.0, 
    top_n: int = -1, 
    weights: Union[Dict[str, float], None] = None
) -> List[Tuple[str, float]]:
    """
    Ranks a list of candidates based on their similarity to a query string.

    This is the primary function for finding the best matches in a collection. It supports
    multiple scoring algorithms, threshold filtering, and integrated string normalization.

    Args:
        query: The string to search for.
        candidates: A collection of strings to search through. Can be a list, 
            pandas.Series, or numpy.ndarray.
        scorer: The similarity algorithm to use. Options include:
            - `"levenshtein"`: Standard edit distance ratio.
            - `"wratio"`: Weighted combination of multiple algorithms (recommended).
            - `"qratio"`: Simplified Levenshtein ratio.
            - `"token_sort"`: Sorts tokens before comparison.
            - `"token_set"`: Set-based comparison (handles duplicates and order).
            - `"jaccard"`: Jaccard similarity between token sets.
            - Or a custom `Callable[[str, str], float]`.
        mode: Matching mode. 
            - `"full"`: Matches the entire candidate string.
            - `"partial"`: Finds the best substring match.
        process: If True, applies normalization (lowercasing, punctuation removal) 
            before matching.
        threshold: Minimum score (0.0 to 1.0) for a candidate to be included in 
            the results.
        top_n: Maximum number of results to return. Use -1 for all matches.
        weights: Dictionary of weights for the `"hybrid"` scorer, or None for
            the defaults (an empty mapping).

    Returns:
        A list of tuples containing (matched_string, similarity_score), 
        sorted by score in descending order.

    Examples:
        >>> import fuzzybunny
        >>> fuzzybunny.rank("apple", ["apple pie", "banana", "apricot"])
        [('apple pie', 0.5555555555555556), ('apricot', 0.42857142857142855)]

        >>> # Partial matching
        >>> fuzzybunny.rank("apple", ["apple pie"], mode="partial")
        [('apple pie', 1.0)]
    """
    # None is the sentinel for "no weights"; never use a mutable default dict.
    if weights is None:
        weights = {}

    # Normalize the candidate container to a plain list[str] for the C extension.
    if _is_pandas_series(candidates):
        candidates = candidates.astype(str).tolist()
    elif _is_numpy_array(candidates):
        # An ndarray already supports astype/tolist; re-wrapping it in
        # np.array() (as before) was a redundant copy and import.
        candidates = candidates.astype(str).tolist()
    elif not isinstance(candidates, (list, tuple)):
        candidates = list(candidates)

    return _fuzzybunny.rank(query, candidates, scorer, mode, process, threshold, top_n, weights)

fuzzybunny.batch_match(queries, candidates, scorer='levenshtein', mode='full', process=True, threshold=0.0, top_n=-1, weights=None)

Efficiently matches multiple queries against a collection of candidates.

Utilizes multi-threading (OpenMP) and internal string normalization caching to provide high-performance batch processing.

Parameters:

Name Type Description Default
queries QueriesType

A collection of strings to match.

required
candidates CandidatesType

A collection of target strings to search through.

required
scorer Union[str, Callable[[str, str], float]]

See rank for available options.

'levenshtein'
mode str

See rank.

'full'
process bool

See rank.

True
threshold float

See rank.

0.0
top_n int

Maximum number of results per query.

-1
weights Dict[str, float]

See rank.

None

Returns:

Type Description
List[List[Tuple[str, float]]]

A list of result lists, where each inner list corresponds to a query.

Note

This function is significantly faster than calling rank in a loop for large datasets due to parallelization and reduced overhead.

Source code in src/fuzzybunny/__init__.py
def batch_match(
    queries: QueriesType, 
    candidates: CandidatesType, 
    scorer: Union[str, Callable[[str, str], float]] = "levenshtein", 
    mode: str = "full", 
    process: bool = True, 
    threshold: float = 0.0, 
    top_n: int = -1, 
    weights: Union[Dict[str, float], None] = None
) -> List[List[Tuple[str, float]]]:
    """
    Efficiently matches multiple queries against a collection of candidates.

    Utilizes multi-threading (OpenMP) and internal string normalization caching
    to provide high-performance batch processing.

    Args:
        queries: A collection of strings to match.
        candidates: A collection of target strings to search through.
        scorer: See `rank` for available options.
        mode: See `rank`.
        process: See `rank`.
        threshold: See `rank`.
        top_n: Maximum number of results per query.
        weights: See `rank`. None means the defaults (an empty mapping).

    Returns:
        A list of result lists, where each inner list corresponds to a query.

    Note:
        This function is significantly faster than calling `rank` in a loop
        for large datasets due to parallelization and reduced overhead.
    """
    # None is the sentinel for "no weights"; never use a mutable default dict.
    if weights is None:
        weights = {}

    # Normalize the candidate container to a plain list[str] for the C extension.
    if _is_pandas_series(candidates):
        candidates = candidates.astype(str).tolist()
    elif _is_numpy_array(candidates):
        # An ndarray already supports astype/tolist; the previous np.array()
        # re-wrap was a redundant copy and import.
        candidates = candidates.astype(str).tolist()
    elif not isinstance(candidates, (list, tuple)):
        candidates = list(candidates)

    # Queries can also be pandas/numpy; both Series and ndarray expose
    # astype(str).tolist() directly, so no numpy round-trip is needed.
    if _is_pandas_series(queries) or _is_numpy_array(queries):
        queries = queries.astype(str).tolist()
    elif not isinstance(queries, (list, tuple)):
        queries = list(queries)

    return _fuzzybunny.batch_match(queries, candidates, scorer, mode, process, threshold, top_n, weights)

Utilities

fuzzybunny.benchmark.benchmark(query, candidates, scorers=None, n_runs=5)

Benchmark different scorers on a given query and set of candidates. Returns a dictionary with timing results.

Source code in src/fuzzybunny/benchmark.py
def benchmark(query, candidates, scorers=None, n_runs=5):
    """
    Benchmark different scorers on a given query and set of candidates.
    Returns a dictionary with timing results.
    """
    from . import rank

    # Default scorer selection when the caller supplies none.
    if scorers is None:
        scorers = ["levenshtein", "jaccard", "token_sort"]

    timings = {}
    for name in scorers:
        samples = []
        for _ in range(n_runs):
            t0 = time.perf_counter()
            rank(query, candidates, scorer=name)
            samples.append(time.perf_counter() - t0)

        # stdev needs at least two samples; report 0 for a single run.
        timings[name] = {
            "mean": statistics.mean(samples),
            "stddev": statistics.stdev(samples) if len(samples) > 1 else 0,
            "min": min(samples),
            "max": max(samples)
        }

    return timings

fuzzybunny.benchmark.benchmark_batch(queries, candidates, scorer='levenshtein', n_runs=3)

Benchmark batch_match performance.

Source code in src/fuzzybunny/benchmark.py
def benchmark_batch(queries, candidates, scorer="levenshtein", n_runs=3):
    """
    Benchmark batch_match performance.

    Args:
        queries: Collection of query strings forwarded to `batch_match`.
        candidates: Collection of candidate strings.
        scorer: Scorer name forwarded to `batch_match`.
        n_runs: Number of timed repetitions.

    Returns:
        Dict with the mean runtime, input sizes, and throughput
        (queries per second, based on the mean runtime).
    """
    from . import batch_match

    times = []
    for _ in range(n_runs):
        start = time.perf_counter()
        batch_match(queries, candidates, scorer=scorer)
        times.append(time.perf_counter() - start)

    # Compute the mean once and reuse it; the original evaluated
    # statistics.mean(times) twice (for "mean" and "queries_per_second").
    mean_time = statistics.mean(times)
    return {
        "mean": mean_time,
        "total_queries": len(queries),
        "total_candidates": len(candidates),
        "queries_per_second": len(queries) / mean_time
    }