一、前置条件
在开始之前,请确保:
- 已安装 `redisvl` 并激活相应的 Python 环境。
- 运行 Redis 实例,且 RediSearch 版本 > 2.4。
二、初始化与数据加载
我们将使用一个包含用户信息的数据集,字段包括 `user`、`age`、`job`、`credit_score`、`office_location`、`user_embedding` 和 `last_updated`。以下是初始化索引和加载数据的代码:
import pickle

from redisvl.index import SearchIndex

# Load the sample records. The file is opened in a `with` block so the handle
# is closed deterministically.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
with open("hybrid_example_data.pkl", "rb") as f:
    data = pickle.load(f)

# Index schema: hash storage, one entry per user attribute, plus a
# 3-dimensional float32 vector field compared by cosine distance.
schema = {
    "index": {
        "name": "user_queries",
        "prefix": "user_queries_docs",
        "storage_type": "hash",
    },
    "fields": [
        {"name": "user", "type": "tag"},
        {"name": "credit_score", "type": "tag"},
        {"name": "job", "type": "text"},
        {"name": "age", "type": "numeric"},
        {"name": "last_updated", "type": "numeric"},
        {"name": "office_location", "type": "geo"},
        {
            "name": "user_embedding",
            "type": "vector",
            "attrs": {
                "dims": 3,
                "distance_metric": "cosine",
                "algorithm": "flat",
                "datatype": "float32",
            },
        },
    ],
}

# Create the search index.
index = SearchIndex.from_dict(schema, redis_url="redis://localhost:6379")
index.create(overwrite=True)

# Load the records into the index and report how many documents it holds.
keys = index.load(data)
print(index.info()['num_docs'])  # Output: 7
使用 `rvl` CLI 检查索引:
rvl index listall
三、混合查询
混合查询结合多种过滤器,例如根据年龄、职业和地理位置进行筛选。以下展示不同类型的过滤器及其应用。
3.1.标签过滤器(Tag Filters)
标签过滤器用于对分类字段(如 `credit_score`)进行精确匹配。
from redisvl.query import VectorQuery
from redisvl.query.filter import Tag

# Keep only users whose credit score is "high".
t = Tag("credit_score") == "high"

# Vector search over user_embedding, restricted by the tag filter above.
v = VectorQuery(
    vector=[0.1, 0.1, 0.5],
    vector_field_name="user_embedding",
    return_fields=[
        "user",
        "credit_score",
        "age",
        "job",
        "office_location",
        "last_updated",
    ],
    filter_expression=t,
)
results = index.query(v)
输出:
vector_distance user credit_score age job office_location last_updated
0 john high 18 engineer -122.4194,37.7749 1741627789
0.109129190445 tyler high 100 engineer -122.0839,37.3861 1742232589
0.158808946609 tim high 12 dermatologist -122.0839,37.3861 1739644189
0.266666650772 nancy high 94 doctor -122.4194,37.7749 1710696589
支持否定和多值匹配:
# Negation: users whose credit score is not "high".
t = Tag("credit_score") != "high"
v.set_filter(t)
results = index.query(v)
# Multi-value match: credit score is "high" or "medium".
t = Tag("credit_score") == ["high", "medium"]
v.set_filter(t)
results = index.query(v)
空标签列表会优雅地回退为通配符查询:
# An empty tag list: per the accompanying text this falls back to a wildcard
# query instead of failing.
t = Tag("credit_score") == []
v.set_filter(t)
results = index.query(v)
3.2.数值过滤器(Numeric Filters)
数值过滤器用于筛选数值字段的范围或精确值。
from redisvl.query.filter import Num

# Keep users aged 15 to 35; the sample output includes both 15 and 35, so the
# bounds are inclusive.
numeric_filter = Num("age").between(15, 35)
v.set_filter(numeric_filter)
results = index.query(v)
输出:
vector_distance user credit_score age job office_location last_updated
0 john high 18 engineer -122.4194,37.7749 1741627789
0.217882037163 taimur low 15 CEO -122.0839,37.3861 1742232589
0.653301358223 joe medium 35 dentist -122.0839,37.3861 1742232589
支持精确匹配和否定:
# Exact match: age equals 14.
numeric_filter = Num("age") == 14
v.set_filter(numeric_filter)
# Negation: age is anything other than 14.
numeric_filter = Num("age") != 14
v.set_filter(numeric_filter)
3.3.时间戳过滤器(Timestamp Filters)
时间戳过滤器支持使用 Python 的 `datetime` 对象进行时间筛选。
from datetime import datetime

from redisvl.query.filter import Timestamp

# Keep only records whose last_updated is later than this moment.
# (The import and the assignment were fused onto one line in the original,
# which is a syntax error.)
dt = datetime(2025, 3, 16, 13, 45, 39, 132589)
timestamp_filter = Timestamp("last_updated") > dt
v.set_filter(timestamp_filter)
results = index.query(v)
输出:
vector_distance user credit_score age job office_location last_updated
0.109129190445 tyler high 100 engineer -122.0839,37.3861 1742232589
0.217882037163 taimur low 15 CEO -122.0839,37.3861 1742232589
0.653301358223 joe medium 35 dentist -122.0839,37.3861 1742232589
支持范围查询:
# Range query: keep records whose last_updated lies between dt_1 and dt_2.
dt_1 = datetime(2025, 1, 14, 13, 45, 39, 132589)
dt_2 = datetime(2025, 3, 16, 13, 45, 39, 132589)
timestamp_filter = Timestamp("last_updated").between(dt_1, dt_2)
v.set_filter(timestamp_filter)
3.4.文本过滤器(Text Filters)
文本过滤器用于对文本字段进行精确、模糊或通配符匹配。
from redisvl.query.filter import Text

# Exact match: job is "doctor".
text_filter = Text("job") == "doctor"
v.set_filter(text_filter)
输出:
vector_distance user credit_score age job office_location last_updated
0 derrick low 14 doctor -122.4194,37.7749 1741627789
0.266666650772 nancy high 94 doctor -122.4194,37.7749 1710696589
支持通配符和模糊匹配:
# Wildcard: job starts with "doct".
wildcard_filter = Text("job") % "doct*"
v.set_filter(wildcard_filter)
# Fuzzy match: the %%...%% wrapper requests approximate matching of "engine".
# NOTE(review): this is presumably edit-distance matching per RediSearch fuzzy
# syntax, not a substring ("contains") search; confirm against the docs.
fuzzy_match = Text("job") % "%%engine%%"
v.set_filter(fuzzy_match)
支持条件匹配:
# Conditional (OR) match: job is "engineer" or "doctor".
conditional = Text("job") % "engineer|doctor"
v.set_filter(conditional)
3.5.地理过滤器(Geo Filters)
地理过滤器用于筛选指定位置和半径范围内的记录。
from redisvl.query.filter import Geo, GeoRadius

# Keep users within 10 km of the San Francisco office. The coordinates are
# given longitude first, matching the office_location values in the outputs.
geo_filter = Geo("office_location") == GeoRadius(-122.4194, 37.7749, 10, "km")
v.set_filter(geo_filter)
输出:
score vector_distance user credit_score age job office_location
0.454545444693 0 john high 18 engineer -122.4194,37.7749
0.454545444693 0 derrick low 14 doctor -122.4194,37.7749
0.454545444693 0.266666650772 nancy high 94 doctor -122.4194,37.7749
支持否定查询:
# Negation: users outside the 10 km radius.
geo_filter = Geo("office_location") != GeoRadius(-122.4194, 37.7749, 10, "km")
v.set_filter(geo_filter)
3.6.组合过滤器
通过 `&`(交集)和 `|`(并集)操作符组合多种过滤器。
# Individual conditions: high credit score, age within [18, 100], and
# last_updated later than the given timestamp.
t = Tag("credit_score") == "high"
low = Num("age") >= 18
high = Num("age") <= 100
ts = Timestamp("last_updated") > datetime(2025, 3, 16, 13, 45, 39, 132589)

# Intersection: a record must satisfy every condition.
combined = t & low & high & ts

v = VectorQuery(
    vector=[0.1, 0.1, 0.5],
    vector_field_name="user_embedding",
    return_fields=["user", "credit_score", "age", "job", "office_location"],
    filter_expression=combined,
)
results = index.query(v)
输出:
vector_distance user credit_score age job office_location
0.109129190445 tyler high 100 engineer -122.0839,37.3861
并集查询:
# Union: users younger than 18 or older than 93.
low = Num("age") < 18
high = Num("age") > 93
combined = low | high
v.set_filter(combined)
动态组合过滤器:
def make_filter(age=None, credit=None, job=None):
    """Build a combined filter on age, credit score and job.

    Args:
        age: Lower bound (exclusive) for the ``age`` numeric field.
        credit: Value to match on the ``credit_score`` tag field.
        job: Pattern to match on the ``job`` text field.

    Returns:
        The intersection (``&``) of the three filter expressions.

    NOTE(review): every parameter defaults to None; RedisVL presumably treats
    a None value as "no constraint" for that field. Confirm against the
    RedisVL filter documentation.
    """
    flexible_filter = (
        (Num("age") > age)
        & (Tag("credit_score") == credit)
        & (Text("job") % job)
    )
    return flexible_filter


# Example: age > 18, credit score "high", job "engineer".
combined = make_filter(age=18, credit="high", job="engineer")
v.set_filter(combined)
results = index.query(v)
四、非向量查询
使用 `FilterQuery` 执行类似 SQL 的非向量查询:
from redisvl.query import FilterQuery

# Select users with a "low" credit score; no vector is involved.
# (The import and the assignment were fused onto one line in the original,
# which is a syntax error.)
has_low_credit = Tag("credit_score") == "low"

# NOTE(review): "location" is not a field of the index schema (the geo field
# is "office_location") and the sample output shows no location column.
# Confirm whether "office_location" was intended.
filter_query = FilterQuery(
    return_fields=["user", "credit_score", "age", "job", "location"],
    filter_expression=has_low_credit,
)
results = index.query(filter_query)
输出:
user credit_score age job
derrick low 14 doctor
taimur low 15 CEO
五、计数查询
使用 `CountQuery` 统计符合条件的记录数:
from redisvl.query import CountQuery

# Count the records with a "low" credit score.
# (The import and the assignment were fused onto one line in the original,
# which is a syntax error.)
has_low_credit = Tag("credit_score") == "low"
filter_query = CountQuery(filter_expression=has_low_credit)
count = index.query(filter_query)
print(f"{count} records match the filter expression {str(has_low_credit)}")
输出:
2 records match the filter expression @credit_score:{low}
六、范围查询
`RangeQuery` 用于筛选向量距离在指定阈值内的记录:
from redisvl.query import RangeQuery

# Query by vector distance, using a threshold of 0.2 from the query vector.
# (The import and the assignment were fused onto one line in the original,
# which is a syntax error.)
range_query = RangeQuery(
    vector=[0.1, 0.1, 0.5],
    vector_field_name="user_embedding",
    return_fields=["user", "credit_score", "age", "job", "location"],
    distance_threshold=0.2,
)
results = index.query(range_query)
输出:
vector_distance user credit_score age job
0 john high 18 engineer
0 derrick low 14 doctor
0.109129190445 tyler high 100 engineer
0.158808946609 tim high 12 dermatologist
调整距离阈值:
# Change the distance threshold to 0.1 and run the range query again.
range_query.set_distance_threshold(0.1)
results = index.query(range_query)
结合过滤器:
# Add a text filter to the range query: job is "engineer".
is_engineer = Text("job") == "engineer"
range_query.set_filter(is_engineer)
results = index.query(range_query)
七、高级查询修饰符
支持排序、方言选择等高级功能:
# Vector query limited to engineers, asking for 5 results, sorted by age in
# descending order and using query dialect 3.
v = (
    VectorQuery(
        vector=[0.1, 0.1, 0.5],
        vector_field_name="user_embedding",
        return_fields=["user", "credit_score", "age", "job", "office_location"],
        num_results=5,
        filter_expression=is_engineer,
    )
    .sort_by("age", asc=False)
    .dialect(3)
)
results = index.query(v)
输出:
vector_distance age user credit_score job office_location
0.109129190445 100 tyler high engineer -122.0839,37.3861
0 18 john high engineer -122.4194,37.7749
八、原始 Redis 查询字符串
将查询转换为原始 Redis 查询字符串:
# Render the query object as its raw Redis query string.
str(v)
输出:
@job:("engineer")=>[KNN 5 @user_embedding $vector AS vector_distance] RETURN 6 user credit_score age job office_location vector_distance SORTBY age DESC DIALECT 3 LIMIT 0 5
直接使用原始查询字符串:
# Run a raw Redis query string directly and print each returned document's
# attribute dictionary.
results = index.search("@credit_score:{high}")
for doc in results.docs:
    print(vars(doc))
九、清理
删除索引:
# Delete the index created at the start of the walkthrough.
index.delete()
十、总结
RedisVL 提供了灵活的查询接口,支持标签、数值、时间戳、文本、地理等多种过滤器,以及向量、非向量、计数和范围查询。通过组合过滤器和动态参数化,开发者可以构建高效的搜索应用,适用于从简单到复杂的场景。更多查询修饰符和 API 详情,请参阅 RedisVL 官方文档。