Skip to content

Commit a06a550

Browse files
committed
Custom Tokenizers
This brings preliminary support for custom FTS tokenizers, registered using the following incantation:

    db.register(tokenizer: "name") { input in
        // ... extract first token and range here
        return (token, range) // return normalized token and range
    }

Signed-off-by: Stephen Celis <stephen@stephencelis.com>
1 parent 5ac56d7 commit a06a550

File tree

7 files changed

+452
-93
lines changed

7 files changed

+452
-93
lines changed

SQLite Tests/FTSTests.swift

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,30 @@ class FTSTests: SQLiteTestCase {
3232
AssertSQL("SELECT * FROM \"emails\" WHERE (\"emails\" MATCH 'hello')", emails.filter(match("hello", emails)))
3333
}
3434

35+
func test_registerTokenizer_registersTokenizer() {
36+
let locale = CFLocaleCopyCurrent()
37+
let tokenizer = CFStringTokenizerCreate(nil, "", CFRangeMake(0, 0), UInt(kCFStringTokenizerUnitWord), locale)
38+
39+
db.register(tokenizer: "tokenizer") { string in
40+
CFStringTokenizerSetString(tokenizer, string, CFRangeMake(0, CFStringGetLength(string)))
41+
if CFStringTokenizerAdvanceToNextToken(tokenizer) == .None {
42+
return nil
43+
}
44+
let range = CFStringTokenizerGetCurrentTokenRange(tokenizer)
45+
let input = CFStringCreateWithSubstring(kCFAllocatorDefault, string, range)
46+
var token = CFStringCreateMutableCopy(nil, range.length, input)
47+
CFStringLowercase(token, locale)
48+
CFStringTransform(token, nil, kCFStringTransformStripDiacritics, 0)
49+
return (token as String, string.rangeOfString(input as String)!)
50+
}
51+
52+
db.create(vtable: emails, using: fts4([subject, body], tokenize: .Custom("tokenizer")))
53+
54+
AssertSQL("CREATE VIRTUAL TABLE \"emails\" USING fts4(\"subject\", \"body\", tokenize=\"SQLite.swift\" 'tokenizer')")
55+
56+
emails.insert(subject <- "Aún más cáfe!")!
57+
58+
XCTAssertEqual(1, emails.filter(match("aun", emails)).count)
59+
}
60+
3561
}

SQLite.xcodeproj/project.pbxproj

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
/* Begin PBXBuildFile section */
1010
30C4A0551A8F5ADC00A6F5E8 /* libsqlite3.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = DC37744219C8DC91004FCF85 /* libsqlite3.dylib */; };
1111
DC0FA83219D87CA3009F3A35 /* SQLite-Bridging-Header.h in Headers */ = {isa = PBXBuildFile; fileRef = DC0FA83119D87CA3009F3A35 /* SQLite-Bridging-Header.h */; settings = {ATTRIBUTES = (Private, ); }; };
12-
DC0FA83719D87E0C009F3A35 /* SQLite-Bridging.c in Sources */ = {isa = PBXBuildFile; fileRef = DC0FA83519D87E0C009F3A35 /* SQLite-Bridging.c */; };
12+
DC0FA83719D87E0C009F3A35 /* SQLite-Bridging.m in Sources */ = {isa = PBXBuildFile; fileRef = DC0FA83519D87E0C009F3A35 /* SQLite-Bridging.m */; };
1313
DC0FA83919D87E0C009F3A35 /* SQLite-Bridging.h in Headers */ = {isa = PBXBuildFile; fileRef = DC0FA83619D87E0C009F3A35 /* SQLite-Bridging.h */; settings = {ATTRIBUTES = (Private, ); }; };
1414
DC109CE11A0C4D970070988E /* Schema.swift in Sources */ = {isa = PBXBuildFile; fileRef = DC109CE01A0C4D970070988E /* Schema.swift */; };
1515
DC109CE41A0C4F5D0070988E /* SchemaTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = DC109CE31A0C4F5D0070988E /* SchemaTests.swift */; };
@@ -20,6 +20,8 @@
2020
DC37743B19C8D6C0004FCF85 /* Statement.swift in Sources */ = {isa = PBXBuildFile; fileRef = DC37743A19C8D6C0004FCF85 /* Statement.swift */; };
2121
DC475EA219F219AF00788FBD /* ExpressionTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = DC475E9E19F2199900788FBD /* ExpressionTests.swift */; };
2222
DC650B9619F0CDC3002FBE91 /* Expression.swift in Sources */ = {isa = PBXBuildFile; fileRef = DC650B9519F0CDC3002FBE91 /* Expression.swift */; };
23+
DC9D389C1AAD458500780AE7 /* fts3_tokenizer.h in Headers */ = {isa = PBXBuildFile; fileRef = DC9D389B1AAD458500780AE7 /* fts3_tokenizer.h */; };
24+
DC9D389D1AAD458500780AE7 /* fts3_tokenizer.h in Headers */ = {isa = PBXBuildFile; fileRef = DC9D389B1AAD458500780AE7 /* fts3_tokenizer.h */; };
2325
DCAD429719E2E0F1004A51DF /* Query.swift in Sources */ = {isa = PBXBuildFile; fileRef = DCAD429619E2E0F1004A51DF /* Query.swift */; };
2426
DCAD429A19E2EE50004A51DF /* QueryTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = DCAD429919E2EE50004A51DF /* QueryTests.swift */; };
2527
DCAFEAD31AABC818000C21A1 /* FTS.swift in Sources */ = {isa = PBXBuildFile; fileRef = DCAFEAD21AABC818000C21A1 /* FTS.swift */; };
@@ -34,7 +36,7 @@
3436
DCC6B3721A9191C300734B78 /* Database.swift in Sources */ = {isa = PBXBuildFile; fileRef = DC37743419C8D626004FCF85 /* Database.swift */; };
3537
DCC6B3731A9191C300734B78 /* Value.swift in Sources */ = {isa = PBXBuildFile; fileRef = DC37743719C8D693004FCF85 /* Value.swift */; };
3638
DCC6B3741A9191C300734B78 /* Schema.swift in Sources */ = {isa = PBXBuildFile; fileRef = DC109CE01A0C4D970070988E /* Schema.swift */; };
37-
DCC6B3751A9191C300734B78 /* SQLite-Bridging.c in Sources */ = {isa = PBXBuildFile; fileRef = DC0FA83519D87E0C009F3A35 /* SQLite-Bridging.c */; };
39+
DCC6B3751A9191C300734B78 /* SQLite-Bridging.m in Sources */ = {isa = PBXBuildFile; fileRef = DC0FA83519D87E0C009F3A35 /* SQLite-Bridging.m */; };
3840
DCC6B3771A9191C300734B78 /* libsqlite3.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = DC37744219C8DC91004FCF85 /* libsqlite3.dylib */; };
3941
DCC6B3791A9191C300734B78 /* SQLite.swift.h in Headers */ = {isa = PBXBuildFile; fileRef = DC3773F819C8CBB3004FCF85 /* SQLite.swift.h */; settings = {ATTRIBUTES = (Public, ); }; };
4042
DCC6B37A1A9191C300734B78 /* SQLite-Bridging-Header.h in Headers */ = {isa = PBXBuildFile; fileRef = DC0FA83119D87CA3009F3A35 /* SQLite-Bridging-Header.h */; settings = {ATTRIBUTES = (Private, ); }; };
@@ -85,7 +87,7 @@
8587

8688
/* Begin PBXFileReference section */
8789
DC0FA83119D87CA3009F3A35 /* SQLite-Bridging-Header.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "SQLite-Bridging-Header.h"; sourceTree = "<group>"; };
88-
DC0FA83519D87E0C009F3A35 /* SQLite-Bridging.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "SQLite-Bridging.c"; sourceTree = "<group>"; };
90+
DC0FA83519D87E0C009F3A35 /* SQLite-Bridging.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "SQLite-Bridging.m"; sourceTree = "<group>"; };
8991
DC0FA83619D87E0C009F3A35 /* SQLite-Bridging.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "SQLite-Bridging.h"; sourceTree = "<group>"; };
9092
DC109CE01A0C4D970070988E /* Schema.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; lineEnding = 0; path = Schema.swift; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.swift; };
9193
DC109CE31A0C4F5D0070988E /* SchemaTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = SchemaTests.swift; sourceTree = "<group>"; };
@@ -95,15 +97,16 @@
9597
DC3773FE19C8CBB3004FCF85 /* SQLite Tests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = "SQLite Tests.xctest"; sourceTree = BUILT_PRODUCTS_DIR; };
9698
DC37740419C8CBB3004FCF85 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
9799
DC37742E19C8CE67004FCF85 /* SQLite.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = SQLite.xcconfig; sourceTree = "<group>"; };
98-
DC37743419C8D626004FCF85 /* Database.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Database.swift; sourceTree = "<group>"; };
100+
DC37743419C8D626004FCF85 /* Database.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; lineEnding = 0; path = Database.swift; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.swift; };
99101
DC37743719C8D693004FCF85 /* Value.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; lineEnding = 0; path = Value.swift; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.swift; };
100-
DC37743A19C8D6C0004FCF85 /* Statement.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Statement.swift; sourceTree = "<group>"; };
102+
DC37743A19C8D6C0004FCF85 /* Statement.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; lineEnding = 0; path = Statement.swift; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.swift; };
101103
DC37744219C8DC91004FCF85 /* libsqlite3.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libsqlite3.dylib; path = usr/lib/libsqlite3.dylib; sourceTree = SDKROOT; };
102104
DC37744719C8F50B004FCF85 /* README.md */ = {isa = PBXFileReference; lastKnownFileType = net.daringfireball.markdown; path = README.md; sourceTree = "<group>"; };
103105
DC3F170F1A8127A300C83A2F /* Functions.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Functions.swift; sourceTree = "<group>"; };
104106
DC3F17121A814F7000C83A2F /* FunctionsTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FunctionsTests.swift; sourceTree = "<group>"; };
105107
DC475E9E19F2199900788FBD /* ExpressionTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ExpressionTests.swift; sourceTree = "<group>"; };
106108
DC650B9519F0CDC3002FBE91 /* Expression.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; lineEnding = 0; path = Expression.swift; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.swift; };
109+
DC9D389B1AAD458500780AE7 /* fts3_tokenizer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fts3_tokenizer.h; sourceTree = "<group>"; };
107110
DCAAE66D19D8A71B00158FEF /* SQLite.playground */ = {isa = PBXFileReference; lastKnownFileType = file.playground; path = SQLite.playground; sourceTree = "<group>"; };
108111
DCAD429619E2E0F1004A51DF /* Query.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; lineEnding = 0; path = Query.swift; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.swift; };
109112
DCAD429919E2EE50004A51DF /* QueryTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = QueryTests.swift; sourceTree = "<group>"; };
@@ -176,7 +179,7 @@
176179
children = (
177180
DC0FA83119D87CA3009F3A35 /* SQLite-Bridging-Header.h */,
178181
DC0FA83619D87E0C009F3A35 /* SQLite-Bridging.h */,
179-
DC0FA83519D87E0C009F3A35 /* SQLite-Bridging.c */,
182+
DC0FA83519D87E0C009F3A35 /* SQLite-Bridging.m */,
180183
);
181184
name = Bridging;
182185
sourceTree = "<group>";
@@ -259,9 +262,18 @@
259262
name = "Supporting Files";
260263
sourceTree = "<group>";
261264
};
265+
DC9D38991AAD457000780AE7 /* ext */ = {
266+
isa = PBXGroup;
267+
children = (
268+
DC9D389B1AAD458500780AE7 /* fts3_tokenizer.h */,
269+
);
270+
name = ext;
271+
sourceTree = "<group>";
272+
};
262273
DCC6B3951A91936500734B78 /* Vendor */ = {
263274
isa = PBXGroup;
264275
children = (
276+
DC9D38991AAD457000780AE7 /* ext */,
265277
DCC6B3961A91938F00734B78 /* sqlcipher.xcodeproj */,
266278
);
267279
path = Vendor;
@@ -298,6 +310,7 @@
298310
isa = PBXHeadersBuildPhase;
299311
buildActionMask = 2147483647;
300312
files = (
313+
DC9D389C1AAD458500780AE7 /* fts3_tokenizer.h in Headers */,
301314
DC3773F919C8CBB3004FCF85 /* SQLite.swift.h in Headers */,
302315
DC0FA83219D87CA3009F3A35 /* SQLite-Bridging-Header.h in Headers */,
303316
DC0FA83919D87E0C009F3A35 /* SQLite-Bridging.h in Headers */,
@@ -308,6 +321,7 @@
308321
isa = PBXHeadersBuildPhase;
309322
buildActionMask = 2147483647;
310323
files = (
324+
DC9D389D1AAD458500780AE7 /* fts3_tokenizer.h in Headers */,
311325
DCC6B3791A9191C300734B78 /* SQLite.swift.h in Headers */,
312326
DCC6B37A1A9191C300734B78 /* SQLite-Bridging-Header.h in Headers */,
313327
DCC6B37B1A9191C300734B78 /* SQLite-Bridging.h in Headers */,
@@ -488,7 +502,7 @@
488502
DCAD429719E2E0F1004A51DF /* Query.swift in Sources */,
489503
DC109CE11A0C4D970070988E /* Schema.swift in Sources */,
490504
DCC6B3A81A91975700734B78 /* Functions.swift in Sources */,
491-
DC0FA83719D87E0C009F3A35 /* SQLite-Bridging.c in Sources */,
505+
DC0FA83719D87E0C009F3A35 /* SQLite-Bridging.m in Sources */,
492506
);
493507
runOnlyForDeploymentPostprocessing = 0;
494508
};
@@ -520,7 +534,7 @@
520534
DCC6B3741A9191C300734B78 /* Schema.swift in Sources */,
521535
DCC6B3A91A91975C00734B78 /* Functions.swift in Sources */,
522536
DCAFEAD41AABC818000C21A1 /* FTS.swift in Sources */,
523-
DCC6B3751A9191C300734B78 /* SQLite-Bridging.c in Sources */,
537+
DCC6B3751A9191C300734B78 /* SQLite-Bridging.m in Sources */,
524538
DCBE28421ABDF18F0042A3FC /* RTree.swift in Sources */,
525539
DCC6B3A41A9194A800734B78 /* Cipher.swift in Sources */,
526540
);

SQLite/FTS.swift

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ public func fts4(columns: Expression<String>...) -> Expression<()> {
2929
// TODO: matchinfo, compress, uncompress
3030
public func fts4(columns: [Expression<String>], tokenize tokenizer: Tokenizer? = nil) -> Expression<()> {
3131
var options = [String: String]()
32-
options["tokenize"] = tokenizer?.rawValue
32+
options["tokenize"] = tokenizer?.description
3333
return fts("fts4", columns, options)
3434
}
3535

@@ -41,14 +41,52 @@ private func fts(function: String, columns: [Expression<String>], options: [Stri
4141
return wrap(function, Expression<()>.join(", ", definitions))
4242
}
4343

44-
public enum Tokenizer: String {
44+
public enum Tokenizer {
4545

46-
case Simple = "simple"
46+
internal static var moduleName = "SQLite.swift"
4747

48-
case Porter = "porter"
48+
case Simple
49+
50+
case Porter
51+
52+
case Custom(String)
53+
54+
}
55+
56+
extension Tokenizer: Printable {
57+
58+
public var description: String {
59+
switch self {
60+
case .Simple:
61+
return "simple"
62+
case .Porter:
63+
return "porter"
64+
case .Custom(let tokenizer):
65+
return "\(quote(identifier: Tokenizer.moduleName)) \(quote(literal: tokenizer))"
66+
}
67+
}
4968

5069
}
5170

5271
public func match(string: String, expression: Query) -> Expression<Bool> {
5372
return infix("MATCH", Expression<String>(expression.tableName), Expression<String>(binding: string))
5473
}
74+
75+
extension Database {
76+
77+
public func register(tokenizer submoduleName: String, next: String -> (String, Range<String.Index>)?) {
78+
try {
79+
SQLiteRegisterTokenizer(self.handle, Tokenizer.moduleName, submoduleName) { input, offset, length in
80+
let string = String.fromCString(input)!
81+
if var (token, range) = next(string) {
82+
let view = string.utf8
83+
offset.memory += count(string.substringToIndex(range.startIndex).utf8)
84+
length.memory = Int32(distance(range.startIndex.samePositionIn(view), range.endIndex.samePositionIn(view)))
85+
return token
86+
}
87+
return nil
88+
}
89+
}
90+
}
91+
92+
}

SQLite/SQLite-Bridging.c

Lines changed: 0 additions & 81 deletions
This file was deleted.

SQLite/SQLite-Bridging.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,12 @@
2222
// THE SOFTWARE.
2323
//
2424

25+
@import Foundation;
26+
2527
#include <sqlite3.h>
2628

29+
#include "fts3_tokenizer.h"
30+
2731
typedef int (^SQLiteBusyHandlerCallback)(int times);
2832
int SQLiteBusyHandler(sqlite3 * handle, SQLiteBusyHandlerCallback callback);
2933

@@ -35,3 +39,6 @@ int SQLiteCreateFunction(sqlite3 * handle, const char * name, int argc, int dete
3539

3640
typedef int (^SQLiteCreateCollationCallback)(const char * lhs, const char * rhs);
3741
int SQLiteCreateCollation(sqlite3 * handle, const char * name, SQLiteCreateCollationCallback callback);
42+
43+
typedef NSString * (^SQLiteTokenizerNextCallback)(const char * input, int * inputOffset, int * inputLength);
44+
int SQLiteRegisterTokenizer(sqlite3 * db, const char * module, const char * tokenizer, SQLiteTokenizerNextCallback callback);

0 commit comments

Comments
 (0)