diff --git a/.gitmodules b/.gitmodules
index 412666b..ff70a2d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
-[submodule "lightgbm-sys/lightgbm"]
+[submodule "lightgbm-upstream"]
 	path = lightgbm-sys/lightgbm
-	url = https://github.com/vaaaaanquish/LightGBM.git
+	url = https://github.com/microsoft/LightGBM.git
diff --git a/Cargo.toml b/Cargo.toml
index 4a3f2b2..6b81c30 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,21 +1,23 @@
 [package]
 name = "lightgbm"
-version = "0.2.3"
+version = "0.2.5"
 authors = ["vaaaaanquish <6syun9@gmail.com>"]
 license = "MIT"
 repository = "https://github.com/vaaaaanquish/LightGBM"
 description = "Machine learning using LightGBM"
 readme = "README.md"
 exclude = [".gitignore", ".gitmodules", "examples", "lightgbm-sys"]
+edition = "2021"
 
 [dependencies]
-lightgbm-sys = { path = "lightgbm-sys", version = "0.3.0" }
-libc = "0.2.81"
-derive_builder = "0.5.1"
-serde_json = "1.0.59"
-polars = {version = "0.16.0", optional = true}
+lightgbm-sys = { path = "lightgbm-sys", version = "0.3.1" }
+libc = "0.2.169"
+derive_builder = "0.20"
+serde_json = "1"
+polars = {version = "0.45", optional = true}
 
 [features]
 default = []
 dataframe = ["polars"]
+cuda = ["lightgbm-sys/cuda"]
diff --git a/lightgbm-sys/.cargo/config b/lightgbm-sys/.cargo/config.toml
similarity index 100%
rename from lightgbm-sys/.cargo/config
rename to lightgbm-sys/.cargo/config.toml
diff --git a/lightgbm-sys/Cargo.toml b/lightgbm-sys/Cargo.toml
index 8f837fe..3dae385 100644
--- a/lightgbm-sys/Cargo.toml
+++ b/lightgbm-sys/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lightgbm-sys"
-version = "0.3.0"
+version = "0.3.1"
 authors = ["vaaaaanquish <6syun9@gmail.com>"]
 build = "build.rs"
 license = "MIT"
@@ -8,10 +8,14 @@ repository = "https://github.com/vaaaaanquish/LightGBM"
 description = "Native bindings to the LightGBM library"
 readme = "README.md"
 exclude = ["README.md", ".gitlab-ci.yml", ".hgeol", ".gitignore", ".appveyor.yml", ".coveralls.yml", ".travis.yml", ".github", ".gitmodules", ".nuget", "**/*.md", "lightgbm/compute/doc", "lightgbm/compute/example", "lightgbm/compute/index.html", "lightgbm/compute/perf", "lightgbm/compute/test", "lightgbm/eigen/debug", "lightgbm/eigen/demos", "lightgbm/eigen/doc", "lightgbm/eigen/failtest", "lightgbm/eigen/test", "lightgbm/examples", "lightgbm/external_libs/fast_double_parser/benchmarks", "lightgbm/external_libs/fmt/doc", "lightgbm/external_libs/fmt/test"]
+edition = "2021"
+
+[features]
+cuda = []
 
 [dependencies]
-libc = "0.2.81"
+libc = "0.2"
 
 [build-dependencies]
-bindgen = "0.56.0"
+bindgen = "0.71"
 cmake = "0.1"
diff --git a/lightgbm-sys/build.rs b/lightgbm-sys/build.rs
index 7c4d7d6..4cd93a4 100644
--- a/lightgbm-sys/build.rs
+++ b/lightgbm-sys/build.rs
@@ -15,7 +15,7 @@ fn main() {
     if !lgbm_root.exists() {
         let status = if target.contains("windows") {
             Command::new("cmd")
-                .args(&[
+                .args([
                     "/C",
                     "echo D | xcopy /S /Y lightgbm",
                     lgbm_root.to_str().unwrap(),
@@ -23,7 +23,7 @@ fn main() {
                 .status()
         } else {
             Command::new("cp")
-                .args(&["-r", "lightgbm", lgbm_root.to_str().unwrap()])
+                .args(["-r", "lightgbm", lgbm_root.to_str().unwrap()])
                 .status()
         };
         if let Some(err) = status.err() {
@@ -36,19 +36,53 @@ fn main() {
     }
 
     // CMake
-    let dst = Config::new(&lgbm_root)
+    let mut dst = Config::new(&lgbm_root);
+    let mut dst = dst
         .profile("Release")
-        .uses_cxx11()
         .define("BUILD_STATIC_LIB", "ON")
-        .build();
+        .define("CMAKE_POSITION_INDEPENDENT_CODE", "ON");
+
+    #[cfg(feature = "cuda")]
+    let mut dst = dst.define("USE_CUDA", "1").define("USE_CUDA_EXP", "1");
+
+    #[cfg(target_os = "macos")]
+    {
+        let path = PathBuf::from("/opt/homebrew/"); // check for m1 vs intel config
+        if let Ok(_dir) = std::fs::read_dir(&path) {
+            dst = dst
+                .define("CMAKE_C_COMPILER", "/opt/homebrew/opt/llvm/bin/clang")
+                .define("CMAKE_CXX_COMPILER", "/opt/homebrew/opt/llvm/bin/clang++")
+                .define("OPENMP_LIBRARIES", "/opt/homebrew/opt/llvm/lib")
+                .define("OPENMP_INCLUDES", "/opt/homebrew/opt/llvm/include");
+        };
+    }
+
+    let dst = dst.build();
 
     // bindgen build
     let bindings = bindgen::Builder::default()
         .header("wrapper.h")
-        .clang_args(&["-x", "c++", "-std=c++11"])
+        .opaque_type("std::.*")
+        .blocklist_type("std::.*")
+        .opaque_type("size_type")
+        .allowlist_type("LGBM_.*")
+        .allowlist_function("LGBM_.*")
+        .allowlist_type("C_API_.*")
+        .allowlist_var("C_API_.*")
+        .clang_args(&["-x", "c++", "-std=c++17", "-flto=thin"])
         .clang_arg(format!("-I{}", lgbm_root.join("include").display()))
-        .generate()
-        .expect("Unable to generate bindings");
+        .parse_callbacks(Box::new(bindgen::CargoCallbacks::new()));
+
+    #[cfg(target_os = "linux")]
+    let bindings = bindings
+        .clang_arg(format!("-I/usr/include/c++/17"))
+        .clang_arg(format!("-I/usr/include/x86_64-linux-gnu/c++/17"));
+
+    #[cfg(feature = "cuda")]
+    let bindings = bindings.clang_arg("-I/usr/local/cuda/include");
+
+    let bindings = bindings.generate().expect("Unable to generate bindings");
+
     let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
     bindings
         .write_to_file(out_path.join("bindings.rs"))
@@ -70,4 +104,10 @@ fn main() {
     } else {
         println!("cargo:rustc-link-lib=static=_lightgbm");
     }
+
+    #[cfg(feature = "cuda")]
+    {
+        println!("cargo:rustc-link-search={}", "/usr/local/cuda/lib64");
+        println!("cargo:rustc-link-lib=static=cudart_static");
+    }
 }
diff --git a/lightgbm-sys/lightgbm b/lightgbm-sys/lightgbm
index 3cf764e..e0c34e7 160000
--- a/lightgbm-sys/lightgbm
+++ b/lightgbm-sys/lightgbm
@@ -1 +1 @@
-Subproject commit 3cf764e438347d1aa12a1f1c464a6295cf7a6134
+Subproject commit e0c34e7b2f68793bbf46854b85fd750be134446d
diff --git a/src/booster.rs b/src/booster.rs
index bd1732b..94d7dad 100644
--- a/src/booster.rs
+++ b/src/booster.rs
@@ -18,6 +18,20 @@ impl Booster {
         Booster { handle }
     }
 
+    /// Initialize model from bytes.
+    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
+        let booster_str = CString::new(bytes).unwrap();
+        let mut out_num_iteration = 0;
+        let mut handle = std::ptr::null_mut();
+        lgbm_call!(lightgbm_sys::LGBM_BoosterLoadModelFromString(
+            booster_str.as_ptr().cast(),
+            &mut out_num_iteration,
+            &mut handle
+        ))?;
+
+        Ok(Booster::new(handle))
+    }
+
     /// Init from model file.
     pub fn from_file(filename: &str) -> Result<Self> {
         let filename_str = CString::new(filename).unwrap();
@@ -95,21 +109,22 @@ impl Booster {
     ///
     /// Input data example
     /// ```
-    /// let data = vec![vec![1.0, 0.1, 0.2],
-    ///                vec![0.7, 0.4, 0.5],
-    ///                vec![0.1, 0.7, 1.0]];
+    /// let data = vec![1.0, 0.1, 0.2,
+    ///                 0.7, 0.4, 0.5,
+    ///                 0.1, 0.7, 1.0];
     /// ```
     ///
     /// Output data example
     /// ```
-    /// let output = vec![vec![1.0, 0.109, 0.433]];
+    /// let output = vec![1.0, 0.109, 0.433];
     /// ```
-    pub fn predict(&self, data: Vec<Vec<f64>>) -> Result<Vec<Vec<f64>>> {
-        let data_length = data.len();
-        let feature_length = data[0].len();
-        let params = CString::new("").unwrap();
-        let mut out_length: c_longlong = 0;
-        let flat_data = data.into_iter().flatten().collect::<Vec<f64>>();
+    pub fn predict(&self, data: &[f32], num_features: i32) -> Result<Vec<f64>> {
+        let ncol = num_features;
+        let nrow = data.len() as i32 / ncol;
+        let is_row_major = 1_i32;
+        let start_iteration = 0_i32;
+        let num_iteration = -1_i32; // no limit
+        let parameters = CString::new("").unwrap();
 
         // get num_class
         let mut num_class = 0;
@@ -118,33 +133,65 @@ impl Booster {
             &mut num_class
         ))?;
 
-        let out_result: Vec<f64> = vec![Default::default(); data_length * num_class as usize];
+        let mut out_length: c_longlong = 0;
+        let mut out_result: Vec<f64> = vec![Default::default(); (nrow * num_class) as usize];
 
         lgbm_call!(lightgbm_sys::LGBM_BoosterPredictForMat(
             self.handle,
-            flat_data.as_ptr() as *const c_void,
-            lightgbm_sys::C_API_DTYPE_FLOAT64 as i32,
-            data_length as i32,
-            feature_length as i32,
-            1_i32,
-            0_i32,
-            0_i32,
-            -1_i32,
-            params.as_ptr() as *const c_char,
+            data.as_ptr() as *const c_void,
+            lightgbm_sys::C_API_DTYPE_FLOAT32 as i32,
+            nrow,
+            ncol,
+            is_row_major,
+            lightgbm_sys::C_API_PREDICT_NORMAL as i32,
+            start_iteration,
+            num_iteration,
+            parameters.as_ptr() as *const c_char,
             &mut out_length,
-            out_result.as_ptr() as *mut c_double
+            out_result.as_mut_ptr(),
         ))?;
 
-        // reshape for multiclass [1,2,3,4,5,6] -> [[1,2,3], [4,5,6]] # 3 class
-        let reshaped_output = if num_class > 1 {
-            out_result
-                .chunks(num_class as usize)
-                .map(|x| x.to_vec())
-                .collect()
-        } else {
-            vec![out_result]
-        };
-        Ok(reshaped_output)
+        Ok(out_result)
+    }
+
+    /// Get number of classes.
+    pub fn num_class(&self) -> Result<i32> {
+        let mut num_class = 0;
+        lgbm_call!(lightgbm_sys::LGBM_BoosterGetNumClasses(
+            self.handle,
+            &mut num_class
+        ))?;
+
+        Ok(num_class)
+    }
+
+    /// Predict for single row.
+    pub fn predict_row(&self, data: Vec<f64>) -> Result<Vec<f64>> {
+        let feature_length = data.len();
+        let params = CString::new("").unwrap();
+
+        let mut num_class = 0;
+        lgbm_call!(lightgbm_sys::LGBM_BoosterGetNumClasses(
+            self.handle,
+            &mut num_class
+        ))?;
+        let mut out_result = vec![Default::default(); num_class as usize];
+
+        lgbm_call!(lightgbm_sys::LGBM_BoosterPredictForMatSingleRow(
+            self.handle,
+            data.as_ptr().cast(),
+            lightgbm_sys::C_API_DTYPE_FLOAT64 as _,
+            feature_length as _,
+            1,
+            lightgbm_sys::C_API_PREDICT_NORMAL as _,
+            0,
+            -1,
+            params.as_ptr().cast(),
+            &mut 0,
+            out_result.as_mut_ptr(),
+        ))?;
+
+        Ok(out_result)
     }
 
     /// Get Feature Num.
@@ -174,7 +221,7 @@ impl Booster {
             self.handle,
             feature_name_length as i32,
             &mut num_feature_names,
-            num_feature as u64,
+            num_feature as usize,
             &mut out_buffer_len,
             out_strs.as_ptr() as *mut *mut c_char
         ))?;
@@ -226,12 +273,12 @@ mod tests {
     use std::path::Path;
 
     fn _read_train_file() -> Result<Dataset> {
-        Dataset::from_file(&"lightgbm-sys/lightgbm/examples/binary_classification/binary.train")
+        Dataset::from_file("lightgbm-sys/lightgbm/examples/binary_classification/binary.train")
     }
 
     fn _train_booster(params: &Value) -> Booster {
         let dataset = _read_train_file().unwrap();
-        Booster::train(dataset, &params).unwrap()
+        Booster::train(dataset, params).unwrap()
     }
 
     fn _default_params() -> Value {
@@ -257,13 +304,35 @@ mod tests {
             }
         };
         let bst = _train_booster(&params);
-        let feature = vec![vec![0.5; 28], vec![0.0; 28], vec![0.9; 28]];
-        let result = bst.predict(feature).unwrap();
+        let mut features = Vec::new();
+
+        for _ in 0..2500 {
+            features.extend(vec![0.5; 28]);
+        }
+
+        assert_eq!(features.len(), 28 * 2500);
+        let result = bst.predict(&features, 28).unwrap();
+        assert_eq!(result.len(), 2500);
+    }
+
+    #[test]
+    fn predict_single_row() {
+        let params = json! {
+            {
+                "num_iterations": 10,
+                "objective": "binary",
+                "metric": "auc",
+                "data_random_seed": 0
+            }
+        };
+        let bst = _train_booster(&params);
+        let feature = vec![0.9; 28];
+        let result = bst.predict_row(feature).unwrap();
         let mut normalized_result = Vec::new();
-        for r in &result[0] {
+        for r in &result {
             normalized_result.push(if r > &0.5 { 1 } else { 0 });
         }
-        assert_eq!(normalized_result, vec![0, 0, 1]);
+        assert_eq!(normalized_result, vec![1]);
     }
 
     #[test]
@@ -274,6 +343,14 @@
         assert_eq!(num_feature, 28);
     }
 
+    #[test]
+    fn num_class() {
+        let params = _default_params();
+        let bst = _train_booster(&params);
+        let num_class = bst.num_class().unwrap();
+        assert_eq!(num_class, 1);
+    }
+
     #[test]
     fn feature_importance() {
         let params = _default_params();
@@ -295,13 +372,13 @@
     fn save_file() {
         let params = _default_params();
        let bst = _train_booster(&params);
-        assert_eq!(bst.save_file(&"./test/test_save_file.output"), Ok(()));
+        assert_eq!(bst.save_file("./test/test_save_file.output"), Ok(()));
         assert!(Path::new("./test/test_save_file.output").exists());
         let _ = fs::remove_file("./test/test_save_file.output");
     }
 
     #[test]
     fn from_file() {
-        let _ = Booster::from_file(&"./test/test_from_file.input");
+        let _ = Booster::from_file("./test/test_from_file.input");
     }
 }
diff --git a/src/dataset.rs b/src/dataset.rs
index b6e1d6f..254a8b0 100644
--- a/src/dataset.rs
+++ b/src/dataset.rs
@@ -37,12 +37,57 @@ pub struct Dataset {
     pub(crate) handle: lightgbm_sys::DatasetHandle,
 }
 
-#[link(name = "c")]
 impl Dataset {
     fn new(handle: lightgbm_sys::DatasetHandle) -> Self {
         Self { handle }
     }
 
+    /// Create a new `Dataset` from a dense array in row-major order
+    /// without allocating rows in memory.
+    ///
+    /// Example
+    /// ```
+    /// use lightgbm::Dataset;
+    ///
+    /// let data = vec![1.0, 0.1, 0.2, 0.1,
+    ///                 0.7, 0.4, 0.5, 0.1,
+    ///                 0.9, 0.8, 0.5, 0.1,
+    ///                 0.2, 0.2, 0.8, 0.7,
+    ///                 0.1, 0.7, 1.0, 0.9];
+    /// let label = vec![0.0, 0.0, 0.0, 1.0, 1.0];
+    /// let dataset = Dataset::from_vec(&data, &label, 4).unwrap();
+    /// ```
+    pub fn from_vec(data: &[f32], labels: &[f32], num_features: i32) -> Result<Self> {
+        let nrows = data.len() as i32 / num_features;
+        let ncol = num_features;
+        let is_row_major = 1_i32; // row-major
+        let parameters = CString::new("").unwrap();
+        let label_name = CString::new("label").unwrap();
+
+        let mut handle = std::ptr::null_mut();
+
+        lgbm_call!(lightgbm_sys::LGBM_DatasetCreateFromMat(
+            data.as_ptr() as *const c_void,
+            lightgbm_sys::C_API_DTYPE_FLOAT32 as i32,
+            nrows,
+            ncol,
+            is_row_major,
+            parameters.as_ptr() as *const c_char,
+            std::ptr::null_mut(),
+            &mut handle
+        ))?;
+
+        lgbm_call!(lightgbm_sys::LGBM_DatasetSetField(
+            handle,
+            label_name.as_ptr() as *const c_char,
+            labels.as_ptr() as *const c_void,
+            nrows,
+            lightgbm_sys::C_API_DTYPE_FLOAT32 as i32
+        ))?;
+
+        Ok(Self::new(handle))
+    }
+
     /// Create a new `Dataset` from dense array in row-major order.
     ///
     /// Example
@@ -153,13 +198,14 @@ impl Dataset {
 
         let (m, n) = dataframe.shape();
 
-        let label_series = &dataframe.select_series(label_col_name)?[0].cast::<Float32Type>()?;
+        let label_series =
+            &dataframe.select_series([label_col_name])?[0].cast(&DataType::Float32)?;
 
         if label_series.null_count() != 0 {
             panic!("Cannot create a dataset with null values, encountered nulls when creating the label array")
         }
 
-        dataframe.drop_in_place(label_col_name)?;
+        let _ = dataframe.drop_in_place(label_col_name)?;
 
         let mut label_values = Vec::with_capacity(m);
 
@@ -182,7 +228,7 @@ impl Dataset {
                     panic!("Cannot create a dataset with null values, encountered nulls when creating the features array")
                 }
 
-                let series = series.cast::<Float64Type>()?;
+                let series = series.cast(&DataType::Float64)?;
                 let ca = series.unpack::<Float64Type>()?;
 
                 ca.into_no_null_iter()
@@ -203,7 +249,7 @@ impl Drop for Dataset {
 mod tests {
     use super::*;
     fn read_train_file() -> Result<Dataset> {
-        Dataset::from_file(&"lightgbm-sys/lightgbm/examples/binary_classification/binary.train")
+        Dataset::from_file("lightgbm-sys/lightgbm/examples/binary_classification/binary.train")
     }
 
     #[test]
@@ -225,6 +271,19 @@
         assert!(dataset.is_ok());
     }
 
+    #[test]
+    fn from_vec() {
+        let data = vec![
+            1.0, 0.1, 0.2, 0.1, 0.7, 0.4, 0.5, 0.1, 0.9, 0.8, 0.5, 0.1, 0.2, 0.2, 0.8, 0.7, 0.1,
+            0.7, 1.0, 0.9,
+        ];
+
+        let labels = vec![0.0, 0.0, 0.0, 1.0, 1.0];
+
+        let dataset = Dataset::from_vec(&data, &labels, 4);
+        assert!(dataset.is_ok());
+    }
+
     #[cfg(feature = "dataframe")]
     #[test]
     fn from_dataframe() {
diff --git a/src/lib.rs b/src/lib.rs
index 2867d47..60af0b3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,7 +5,6 @@ extern crate serde_json;
 #[cfg(feature = "dataframe")]
 extern crate polars;
 
-#[macro_use]
 macro_rules! lgbm_call {
     ($x:expr) => {
         Error::check_return_value(unsafe { $x })
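Usage sketch (not part of the patch): the hunks above replace the nested-`Vec` prediction API with flat, row-major slices and add `Dataset::from_vec`, `Booster::from_bytes`, `Booster::num_class`, and `Booster::predict_row`. The snippet below is a minimal sketch of how the reworked API would fit together, assuming only the signatures shown in this diff; the toy data and training parameters are illustrative, not taken from the PR.

```rust
use lightgbm::{Booster, Dataset};
use serde_json::json;

fn main() {
    // 5 rows x 4 features in a single row-major buffer, plus one label per row.
    // Tiny toy data: LightGBM will train, but the model is trivial.
    let data: Vec<f32> = vec![
        1.0, 0.1, 0.2, 0.1,
        0.7, 0.4, 0.5, 0.1,
        0.9, 0.8, 0.5, 0.1,
        0.2, 0.2, 0.8, 0.7,
        0.1, 0.7, 1.0, 0.9,
    ];
    let labels: Vec<f32> = vec![0.0, 0.0, 0.0, 1.0, 1.0];

    // Build the dataset without allocating per-row Vecs (new from_vec path).
    let dataset = Dataset::from_vec(&data, &labels, 4).unwrap();
    let params = json! {{ "num_iterations": 10, "objective": "binary", "metric": "auc" }};
    let booster = Booster::train(dataset, &params).unwrap();

    // Batch prediction over the same flat slice; the output is flat as well,
    // holding nrow * num_class scores (no reshaping into Vec<Vec<f64>> anymore).
    let scores = booster.predict(&data, 4).unwrap();
    assert_eq!(scores.len(), 5 * booster.num_class().unwrap() as usize);

    // Single-row prediction takes owned f64 features.
    let row_score = booster.predict_row(vec![0.9, 0.8, 0.5, 0.1]).unwrap();
    println!("batch: {:?}, single row: {:?}", scores, row_score);
}
```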