diff --git a/.gitignore b/.gitignore index 3300a23e..8e7167ac 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ Gemfile.lock .bundle vendor +lib/jars diff --git a/.travis.yml b/.travis.yml index 73bc767a..a50fc739 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,2 @@ ---- -sudo: false -language: ruby -cache: bundler -rvm: -- jruby-1.7.25 -jdk: oraclejdk8 -script: bundle exec rspec spec --order rand -before_install: [] +import: +- logstash-plugins/.ci:travis/travis.yml@1.x \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 692d0cfa..255c5fad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,16 +1,205 @@ +## 4.4.6 + - Change read mode to immediately stop consuming buffered lines when shutdown is requested [#322](https://github.com/logstash-plugins/logstash-input-file/pull/322) + +## 4.4.5 + - Handle EOF when checking archive validity [#321](https://github.com/logstash-plugins/logstash-input-file/pull/321) + +## 4.4.4 + - Fixes gzip file handling in read mode when run on JDK12+, including JDK17 that is bundled with Logstash 8.4+ [#312](https://github.com/logstash-plugins/logstash-input-file/pull/312) + +## 4.4.3 + - Fixes read mode to restart the read from reference stored in sincedb in case the file wasn't completely consumed. [#307](https://github.com/logstash-plugins/logstash-input-file/pull/307) + +## 4.4.2 + - Doc: Fix attribute by removing extra character [#310](https://github.com/logstash-plugins/logstash-input-file/pull/310) + +## 4.4.1 + - Fix: update to Gradle 7 [#305](https://github.com/logstash-plugins/logstash-input-file/pull/305) + - [DOC] Add version attributes to doc source file [#308](https://github.com/logstash-plugins/logstash-input-file/pull/308) + +## 4.4.0 + - Add support for ECS v8 [#301](https://github.com/logstash-plugins/logstash-input-file/pull/301) + +## 4.3.1 + - Add extra safety to `chown` call in `atomic_write`, avoiding plugin crashes and falling back to a + `non_atomic_write` in the event of failure [#295](https://github.com/logstash-plugins/logstash-input-file/pull/295) + - Refactor: unify event updates to happen in one place [#297](https://github.com/logstash-plugins/logstash-input-file/pull/297) + - Test: Actually retry tests on `RSpec::Expectations::ExpectationNotMetError` and retry instead of relying on timeout + [#297](https://github.com/logstash-plugins/logstash-input-file/pull/297) + +## 4.3.0 + - Add ECS Compatibility Mode [#291](https://github.com/logstash-plugins/logstash-input-file/pull/291) + +## 4.2.4 + - Fix: sincedb_write issue on Windows machines [#283](https://github.com/logstash-plugins/logstash-input-file/pull/283) + +## 4.2.3 + - Refactor: improve debug logging (log catched exceptions) [#280](https://github.com/logstash-plugins/logstash-input-file/pull/280) + +## 4.2.2 + - Fix: sincedb_clean_after not being respected [#276](https://github.com/logstash-plugins/logstash-input-file/pull/276) + +## 4.2.1 + - Fix: skip sincedb eviction if read mode completion deletes file during flush [#273](https://github.com/logstash-plugins/logstash-input-file/pull/273) + +## 4.2.0 + - Fix: watched files performance with huge filesets [#268](https://github.com/logstash-plugins/logstash-input-file/pull/268) + - Updated logging to include full traces in debug (and trace) levels + +## 4.1.18 + - Fix: release watched files on completion (in read-mode) [#271](https://github.com/logstash-plugins/logstash-input-file/pull/271) + +## 4.1.17 + - Added configuration setting `check_archive_validity` settings to enable + gzipped files 
verification, issue + [#261](https://github.com/logstash-plugins/logstash-input-file/issues/261) + - [DOC] Added clarification for settings available with `read` mode [#235](https://github.com/logstash-plugins/logstash-input-file/pull/235) + - [DOC] Rearranged text and fixed formatting for `mode` setting [266](https://github.com/logstash-plugins/logstash-input-file/pull/266) + +## 4.1.16 + - Added configuration setting exit_after_read to read to EOF and terminate + the input [#240](https://github.com/logstash-plugins/logstash-input-file/pull/240) + +## 4.1.15 + - Fixed bug in conversion of sincedb_clean_after setting [#257](https://github.com/logstash-plugins/logstash-input-file/pull/257) + +## 4.1.14 + - Fixed bug in delete of multiple watched files [#254](https://github.com/logstash-plugins/logstash-input-file/pull/254) + +## 4.1.13 + - Fixed sinceDB to work spaces filename [#249](https://github.com/logstash-plugins/logstash-input-file/pull/249) + +## 4.1.12 + - Fix regression in `exclude` handling. Patterns are matched against the filename, not full path. + [Issue #237](https://github.com/logstash-plugins/logstash-input-file/issues/237) + +## 4.1.11 + - Fixed link to FAQ [#247](https://github.com/logstash-plugins/logstash-input-file/pull/247) + +## 4.1.10 + - Fixed problem in Windows where some paths would fail to return an identifier ("inode"). Make path into a C style String before encoding to UTF-16LE. [#232](https://github.com/logstash-plugins/logstash-input-file/issues/232) + +## 4.1.9 + - Fixed issue where logs were being spammed with needless error messages [#224](https://github.com/logstash-plugins/logstash-input-file/pull/224) + +## 4.1.8 + - Fixed problem in tail and read modes where the read loop could get stuck if an IO error occurs in the loop. + The file appears to be being read but it is not, suspected with file truncation schemes. + [Issue #205](https://github.com/logstash-plugins/logstash-input-file/issues/205) + +## 4.1.7 + - Fixed problem in rotation handling where the target file being rotated was + subjected to the start_position setting when it must always start from the beginning. + [Issue #214](https://github.com/logstash-plugins/logstash-input-file/issues/214) + +## 4.1.6 + - Fixed Errno::ENOENT exception in Discoverer. [Issue #204](https://github.com/logstash-plugins/logstash-input-file/issues/204) + +## 4.1.5 + - Fixed text anchor by changing it from hardcoded to asciidoc reference to + work in versioned plugin reference + +## 4.1.4 + - Fixed a regression where files discovered after first discovery were not + always read from the beginning. Applies to tail mode only. + [#198](https://github.com/logstash-plugins/logstash-input-file/issues/198) + - Added much better support for file rotation schemes of copy/truncate and + rename cascading. Applies to tail mode only. + - Added support for processing files over remote mounts e.g. NFS. Before, it + was possible to read into memory allocated but not filled with data resulting + in ASCII NUL (0) bytes in the message field. Now, files are read up to the + size as given by the remote filesystem client. Applies to tail and read modes. + +## 4.1.3 + - Fixed `read` mode of regular files sincedb write is requested in each read loop + iteration rather than waiting for the end-of-file to be reached. Note: for gz files, + the sincedb entry can only be updated at the end of the file as it is not possible + to seek into a compressed file and begin reading from that position. 
+ [#196](https://github.com/logstash-plugins/logstash-input-file/pull/196) + - Added support for String Durations in some settings e.g. `stat_interval => "750 ms"` + [#194](https://github.com/logstash-plugins/logstash-input-file/pull/194) + +## 4.1.2 + - Fix `require winhelper` error in WINDOWS. + [Issue #184](https://github.com/logstash-plugins/logstash-input-file/issues/184) + - Fix when no delimiter is found in a chunk, the chunk is reread - no forward progress + is made in the file. + [Issue #185](https://github.com/logstash-plugins/logstash-input-file/issues/185) + +## 4.1.1 + - Fix JAR_VERSION read problem, prevented Logstash from starting. + [Issue #180](https://github.com/logstash-plugins/logstash-input-file/issues/180) + - Fix sincedb write error when using /dev/null, repeatedly causes a plugin restart. + [Issue #182](https://github.com/logstash-plugins/logstash-input-file/issues/182) + +## 4.1.0 + - Move Filewatch code into the plugin folder, rework Filewatch code to use + Logstash facilities like logging and environment. + - New feature: `mode` setting. Introduces two modes, `tail` mode is the + existing behaviour for tailing, `read` mode is new behaviour that is + optimized for the read complete content scenario. Please read the docs to + fully appreciate the benefits of `read` mode. + - New feature: File completion actions. Settings `file_completed_action` + and `file_completed_log_path` control what actions to do after a file is + completely read. Applicable: `read` mode only. + - New feature: in `read` mode, compressed files can be processed, GZIP only. + - New feature: Files are sorted after being discovered. Settings `file_sort_by` + and `file_sort_direction` control the sort order. Applicable: any mode. + - New feature: Banded or striped file processing. Settings: `file_chunk_size` + and `file_chunk_count` control banded or striped processing. Applicable: any mode. + - New feature: `sincedb_clean_after` setting. Introduces expiry of sincedb + records. The default is 14 days. If, after `sincedb_clean_after` days, no + activity has been detected on a file (inode) the record expires and is not + written to disk. The persisted record now includes the "last activity seen" + timestamp. Applicable: any mode. + - Docs: extensive additions to introduce the new features. + +## 4.0.5 + - Docs: Set the default_codec doc attribute. + +## 4.0.4 + - Update gemspec summary + +## 4.0.3 + - Fix some documentation issues + +## 4.0.1 + - Docs: Fix the description with the logstash documentation generator + - Fix an issue with the rspec suite not finding log4j + +## 4.0.0 + - Breaking: `ignore_older` settings is disabled by default. Previously if the file was older than + 24 hours (the default for ignore_older), it would be ignored. This confused new users a lot, specially + when they were reading new files with Logstash (with `start_position => beginning`). This setting also + makes it consistent with Filebeat. + +## 3.1.2 + - Adjust a few log call levels + +## 3.1.1 + - Add host to @metadata + +## 3.1.0 + - Breaking: Use native `--path.data` for Logstash 5.0 for sincedb files. + ## 3.0.3 - Relax constraint on logstash-core-plugin-api to >= 1.60 <= 2.99 ## 3.0.2 - relax constrains of `logstash-devutils` see https://github.com/elastic/logstash-devutils/issues/48 + ## 3.0.1 - Republish all the gems under jruby. + ## 3.0.0 - Update the plugin to the version 2.0 of the plugin api, this change is required for Logstash 5.0 compatibility. 
See https://github.com/elastic/logstash/issues/5141 + # 2.2.5 - Depend on logstash-core-plugin-api instead of logstash-core, removing the need to mass update plugins on major releases of logstash + # 2.2.3 - New dependency requirements for logstash-core for the 5.0 release + ## 2.2.2 - Fix for: Filewatch library complains if HOME or SINCEDB_PATH variables are unset. - [Issue #101](https://github.com/logstash-plugins/logstash-input-file/issues/101) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index ea055901..d89064c2 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -18,6 +18,7 @@ Contributors: * elliot moore (em295) * yjpa7145 * Guy Boertje (guyboertje) +* Aaron Mildenstein (untergeek) Note: If you've sent us patches, bug reports, or otherwise contributed to Logstash, and you aren't on the list above and want to be, please let us know diff --git a/Gemfile b/Gemfile index 2b03d18e..32cc6fbb 100644 --- a/Gemfile +++ b/Gemfile @@ -1,4 +1,11 @@ source 'https://rubygems.org' -# Specify your gem's dependencies in logstash-mass_effect.gemspec gemspec + +logstash_path = ENV["LOGSTASH_PATH"] || "../../logstash" +use_logstash_source = ENV["LOGSTASH_SOURCE"] && ENV["LOGSTASH_SOURCE"].to_s == "1" + +if Dir.exist?(logstash_path) && use_logstash_source + gem 'logstash-core', :path => "#{logstash_path}/logstash-core" + gem 'logstash-core-plugin-api', :path => "#{logstash_path}/logstash-core-plugin-api" +end diff --git a/JAR_VERSION b/JAR_VERSION new file mode 100644 index 00000000..7dea76ed --- /dev/null +++ b/JAR_VERSION @@ -0,0 +1 @@ +1.0.1 diff --git a/LICENSE b/LICENSE index 43976b73..a80a3fd5 100644 --- a/LICENSE +++ b/LICENSE @@ -1,13 +1,202 @@ -Copyright (c) 2012–2016 Elasticsearch -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ - http://www.apache.org/licenses/LICENSE-2.0 + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2020 Elastic and contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index a48c73a9..7c81a581 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,6 @@ # Logstash Plugin Travis Build -[![Travis Build Status](https://travis-ci.org/logstash-plugins/logstash-input-file.svg)](https://travis-ci.org/logstash-plugins/logstash-input-file) - -Jenkins Build -[![Travis Build Status](https://travis-ci.org/logstash-plugins/logstash-input-file.svg)](https://travis-ci.org/logstash-plugins/logstash-input-file) +[![Travis Build Status](https://travis-ci.com/logstash-plugins/logstash-input-file.svg)](https://travis-ci.com/logstash-plugins/logstash-input-file) This is a plugin for [Logstash](https://github.com/elastic/logstash). 
@@ -40,6 +37,11 @@ bundle install ```sh bundle install +``` + + - Build the jar library used for watching files +```bash +./gradlew build ``` - Run tests diff --git a/Rakefile b/Rakefile index 4f4b8586..ca92ecf9 100644 --- a/Rakefile +++ b/Rakefile @@ -5,3 +5,9 @@ task :default do end require "logstash/devutils/rake" + +desc "Compile and put filewatch jar into lib/jars" +task :vendor do + exit(1) unless system './gradlew --no-daemon clean jar' + puts "-------------------> built filewatch jar via rake" +end diff --git a/build.gradle b/build.gradle new file mode 100644 index 00000000..a0d33cbd --- /dev/null +++ b/build.gradle @@ -0,0 +1,93 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import java.nio.file.Files +import static java.nio.file.StandardCopyOption.REPLACE_EXISTING + +plugins { + id 'java' + id 'distribution' + id 'idea' +} + +group = 'org.logstash.filewatch' +version file("JAR_VERSION").text.replaceAll("\\s","") + +repositories { + mavenCentral() +} + +java { + sourceCompatibility = JavaVersion.VERSION_1_8 + targetCompatibility = JavaVersion.VERSION_1_8 + withSourcesJar() + withJavadocJar() +} + +dependencies { + compileOnly group: 'org.jruby', name: 'jruby-complete', version: "9.1.13.0" +} + +task copyGemjar(type: Copy, dependsOn: sourcesJar) { + from project.jar + into project.file('lib/jars/') +} + +task cleanGemjar { + delete fileTree(project.file('lib/jars/')) { + include '*.jar' + } +} + +clean.dependsOn(cleanGemjar) +jar.finalizedBy(copyGemjar) + + +task generateGemJarRequiresFile { + doLast { + File jars_file = file('lib/logstash-input-file_jars.rb') + jars_file.newWriter().withWriter { w -> + w << "# AUTOGENERATED BY THE GRADLE SCRIPT. 
DO NOT EDIT.\n\n" + w << "require \'jar_dependencies\'\n" + configurations.runtimeClasspath.allDependencies.each { + w << "require_jar(\'${it.group}\', \'${it.name}\', \'${it.version}\')\n" + } + w << "\nrequire_jar(\'${project.group}\', \'${project.name}\', \'${project.version}\')\n" + } + } +} + +task vendor { + doLast { + String vendorPathPrefix = "vendor/jar-dependencies" + configurations.runtimeClasspath.allDependencies.each { dep -> + File f = configurations.runtimeClasspath.filter { it.absolutePath.contains("${dep.group}/${dep.name}/${dep.version}") }.singleFile + String groupPath = dep.group.replaceAll('\\.', '/') + File newJarFile = file("${vendorPathPrefix}/${groupPath}/${dep.name}/${dep.version}/${dep.name}-${dep.version}.jar") + newJarFile.mkdirs() + Files.copy(f.toPath(), newJarFile.toPath(), REPLACE_EXISTING) + } + String projectGroupPath = project.group.replaceAll('\\.', '/') + File projectJarFile = file("${vendorPathPrefix}/${projectGroupPath}/${project.name}/${project.version}/${project.name}-${project.version}.jar") + projectJarFile.mkdirs() + Files.copy(file("$buildDir/libs/${project.name}-${project.version}.jar").toPath(), projectJarFile.toPath(), REPLACE_EXISTING) + } +} + +vendor.dependsOn(jar, generateGemJarRequiresFile) \ No newline at end of file diff --git a/docs/index.asciidoc b/docs/index.asciidoc new file mode 100644 index 00000000..159f7d0d --- /dev/null +++ b/docs/index.asciidoc @@ -0,0 +1,536 @@ +:plugin: file +:type: input +:default_codec: plain + +/////////////////////////////////////////// +START - GENERATED VARIABLES, DO NOT EDIT! +/////////////////////////////////////////// +:version: %VERSION% +:release_date: %RELEASE_DATE% +:changelog_url: %CHANGELOG_URL% +:include_path: ../../../../logstash/docs/include +/////////////////////////////////////////// +END - GENERATED VARIABLES, DO NOT EDIT! +/////////////////////////////////////////// + +[id="plugins-{type}s-{plugin}"] + +=== File input plugin + +include::{include_path}/plugin_header.asciidoc[] + +==== Description + +Stream events from files, normally by tailing them in a manner +similar to `tail -0F` but optionally reading them from the +beginning. + +Normally, logging will add a newline to the end of each line written. +By default, each event is assumed to be one line +and a line is taken to be the text before a newline character. +If you would like to join multiple log lines into one event, +you'll want to use the multiline codec. +The plugin loops between discovering new files and processing +each discovered file. Discovered files have a lifecycle, they start off +in the "watched" or "ignored" state. Other states in the lifecycle are: +"active", "closed" and "unwatched" + +By default, a window of 4095 files is used to limit the number of file handles in use. +The processing phase has a number of stages: + +* Checks whether "closed" or "ignored" files have changed in size since last time and +if so puts them in the "watched" state. +* Selects enough "watched" files to fill the available space in the window, these files +are made "active". +* The active files are opened and read, each file is read from the last known position +to the end of current content (EOF) by default. + +In some cases it is useful to be able to control which files are read first, sorting, +and whether files are read completely or banded/striped. +Complete reading is *all of* file A then file B then file C and so on. 
+Banded or striped reading is *some of* file A then file B then file C and so on looping around +to file A again until all files are read. Banded reading is specified by changing +<> and perhaps <>. +Banding and sorting may be useful if you want some events from all files to appear +in Kibana as early as possible. + +The plugin has two modes of operation, Tail mode and Read mode. + +===== Tail mode + +In this mode the plugin aims to track changing files and emit new content as it's +appended to each file. In this mode, files are seen as a never ending stream of +content and EOF has no special significance. The plugin always assumes that +there will be more content. When files are rotated, the smaller or zero size is +detected, the current position is reset to zero and streaming continues. +A delimiter must be seen before the accumulated characters can be emitted as a line. + +===== Read mode + +In this mode the plugin treats each file as if it is content complete, that is, +a finite stream of lines and now EOF is significant. A last delimiter is not +needed because EOF means that the accumulated characters can be emitted as a line. +Further, EOF here means that the file can be closed and put in the "unwatched" +state - this automatically frees up space in the active window. This mode also +makes it possible to process compressed files as they are content complete. +Read mode also allows for an action to take place after processing the file completely. + +In the past attempts to simulate a Read mode while still assuming infinite streams +was not ideal and a dedicated Read mode is an improvement. + +[id="plugins-{type}s-{plugin}-ecs"] +==== Compatibility with the Elastic Common Schema (ECS) + +This plugin adds metadata about event's source, and can be configured to do so +in an https://www.elastic.co/guide/en/ecs/{ecs_version}/index.html[ECS-compatible] way with <>. +This metadata is added after the event has been decoded by the appropriate codec, +and will never overwrite existing values. + +|======== +| ECS Disabled | ECS `v1`, `v8` | Description + +| `host` | `[host][name]` | The name of the {ls} host that processed the event +| `path` | `[log][file][path]` | The full path to the log file from which the event originates +|======== + +==== Tracking of current position in watched files + +The plugin keeps track of the current position in each file by +recording it in a separate file named sincedb. This makes it +possible to stop and restart Logstash and have it pick up where it +left off without missing the lines that were added to the file while +Logstash was stopped. + +By default, the sincedb file is placed in the data directory of Logstash +with a filename based on the filename patterns being watched (i.e. the `path` option). +Thus, changing the filename patterns will result in a new sincedb file being used and +any existing current position state will be lost. If you change your patterns +with any frequency it might make sense to explicitly choose a sincedb path +with the `sincedb_path` option. + +A different `sincedb_path` must be used for each input. Using the same +path will cause issues. The read checkpoints for each input must be +stored in a different path so the information does not override. + +Files are tracked via an identifier. This identifier is made up of the +inode, major device number and minor device number. In windows, a different +identifier is taken from a `kernel32` API call. 
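+
+For example, to keep the read checkpoints of two inputs separate, each input can
+be pinned to its own sincedb file (the paths below are only illustrative):
+
+[source,ruby]
+  input {
+    file {
+      path => "/var/log/app-a/*.log"
+      sincedb_path => "/var/lib/logstash/sincedb-app-a"
+    }
+    file {
+      path => "/var/log/app-b/*.log"
+      sincedb_path => "/var/lib/logstash/sincedb-app-b"
+    }
+  }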
+ +Sincedb records can now be expired meaning that read positions of older files +will not be remembered after a certain time period. File systems may need to reuse +inodes for new content. Ideally, we would not use the read position of old content, +but we have no reliable way to detect that inode reuse has occurred. This is more +relevant to Read mode where a great many files are tracked in the sincedb. +Bear in mind though, if a record has expired, a previously seen file will be read again. + +Sincedb files are text files with four (< v5.0.0), five or six columns: + +. The inode number (or equivalent). +. The major device number of the file system (or equivalent). +. The minor device number of the file system (or equivalent). +. The current byte offset within the file. +. The last active timestamp (a floating point number) +. The last known path that this record was matched to (for +old sincedb records converted to the new format, this is blank. + +On non-Windows systems you can obtain the inode number of a file +with e.g. `ls -li`. + +==== Reading from remote network volumes + +The file input is not thoroughly tested on remote filesystems such as NFS, +Samba, s3fs-fuse, etc, however NFS is occasionally tested. The file size as given by +the remote FS client is used to govern how much data to read at any given time to +prevent reading into allocated but yet unfilled memory. +As we use the device major and minor in the identifier to track "last read" +positions of files and on remount the device major and minor can change, the +sincedb records may not match across remounts. +Read mode might not be suitable for remote filesystems as the file size at +discovery on the client side may not be the same as the file size on the remote side +due to latency in the remote to client copy process. + +==== File rotation in Tail mode + +File rotation is detected and handled by this input, regardless of +whether the file is rotated via a rename or a copy operation. To +support programs that write to the rotated file for some time after +the rotation has taken place, include both the original filename and +the rotated filename (e.g. /var/log/syslog and /var/log/syslog.1) in +the filename patterns to watch (the `path` option). +For a rename, the inode will be detected as having moved from +`/var/log/syslog` to `/var/log/syslog.1` and so the "state" is moved +internally too, the old content will not be reread but any new content +on the renamed file will be read. +For copy/truncate the copied content into a new file path, if discovered, will +be treated as a new discovery and be read from the beginning. The copied file +paths should therefore not be in the filename patterns to watch (the `path` option). +The truncation will be detected and the "last read" position updated to zero. + +[id="plugins-{type}s-{plugin}-options"] +==== File Input Configuration Options + +This plugin supports the following configuration options plus the <> described later. + +[NOTE] +Duration settings can be specified in text form e.g. "250 ms", this string will be converted into +decimal seconds. There are quite a few supported natural and abbreviated durations, +see <> for the details. 
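+
+For example, duration settings accept either a number of seconds or a string
+duration (the values below are only illustrative):
+
+[source,ruby]
+  stat_interval => "750 ms"
+  close_older => "1 hour"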
+ +[cols="<,<,<",options="header",] +|======================================================================= +|Setting |Input type|Required +| <> |<>|No +| <> |<> or <>|No +| <> |<>|No +| <> |<>|No +| <> |<>|No +| <> |<>|No +| <> |<>|No +| <> |<>|No +| <> |<>|No +| <> |<>, one of `["delete", "log", "log_and_delete"]`|No +| <> |<>|No +| <> |<>, one of `["last_modified", "path"]`|No +| <> |<>, one of `["asc", "desc"]`|No +| <> |<> or <>|No +| <> |<>|No +| <> |<>, one of `["tail", "read"]`|No +| <> |<>|Yes +| <> |<> or <>|No +| <> |<>|No +| <> |<> or <>|No +| <> |<>, one of `["beginning", "end"]`|No +| <> |<> or <>|No +|======================================================================= + +Also see <> for a list of options supported by all +input plugins. + +  + +[id="plugins-{type}s-{plugin}-check_archive_validity"] +===== `check_archive_validity` + + * Value type is <> + * The default is `false`. + +When set to `true`, this setting verifies that a compressed file is valid before +processing it. There are two passes through the file--one pass to +verify that the file is valid, and another pass to process the file. + +Validating a compressed file requires more processing time, but can prevent a +corrupt archive from causing looping. + + +[id="plugins-{type}s-{plugin}-close_older"] +===== `close_older` + + * Value type is <> or <> + * Default value is `"1 hour"` + +The file input closes any files that were last read the specified +duration (seconds if a number is specified) ago. +This has different implications depending on if a file is being tailed or +read. If tailing, and there is a large time gap in incoming data the file +can be closed (allowing other files to be opened) but will be queued for +reopening when new data is detected. If reading, the file will be closed +after closed_older seconds from when the last bytes were read. +This setting is retained for backward compatibility if you upgrade the +plugin to 4.1.0+, are reading not tailing and do not switch to using Read mode. + +[id="plugins-{type}s-{plugin}-delimiter"] +===== `delimiter` + + * Value type is <> + * Default value is `"\n"` + +set the new line delimiter, defaults to "\n". Note that when reading compressed files +this setting is not used, instead the standard Windows or Unix line endings are used. + +[id="plugins-{type}s-{plugin}-discover_interval"] +===== `discover_interval` + + * Value type is <> + * Default value is `15` + +How often we expand the filename patterns in the `path` option to discover new files to watch. +This value is a multiple to `stat_interval`, e.g. if `stat_interval` is "500 ms" then new files +files could be discovered every 15 X 500 milliseconds - 7.5 seconds. +In practice, this will be the best case because the time taken to read new content needs to be factored in. + +[id="plugins-{type}s-{plugin}-ecs_compatibility"] +===== `ecs_compatibility` + +* Value type is <> +* Supported values are: +** `disabled`: sets non-ECS metadata on event (such as top-level `host`, `path`) +** `v1`,`v8`: sets ECS-compatible metadata on event (such as `[host][name]`, `[log][file][path]`) +* Default value depends on which version of Logstash is running: +** When Logstash provides a `pipeline.ecs_compatibility` setting, its value is used as the default +** Otherwise, the default value is `disabled`. + +Controls this plugin's compatibility with the +https://www.elastic.co/guide/en/ecs/{ecs_version}/index.html[Elastic Common Schema (ECS)]. 
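+
+For example, ECS field naming can be pinned per input rather than inherited from
+the pipeline-level default (the path shown is only illustrative):
+
+[source,ruby]
+  file {
+    path => "/var/log/*.log"
+    ecs_compatibility => "v8"
+  }
+
+With `v1` or `v8`, the source file path is recorded in `[log][file][path]` and the
+host name in `[host][name]` rather than in the top-level `path` and `host` fields.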
+ +[id="plugins-{type}s-{plugin}-exclude"] +===== `exclude` + + * Value type is <> + * There is no default value for this setting. + +Exclusions (matched against the filename, not full path). Filename +patterns are valid here, too. For example, if you have +[source,ruby] + path => "/var/log/*" + +In Tail mode, you might want to exclude gzipped files: +[source,ruby] + exclude => "*.gz" + +[id="plugins-{type}s-{plugin}-exit_after_read"] +===== `exit_after_read` + + * Value type is <> + * Default value is `false` + +This option can be used in `read` mode to enforce closing all watchers when file gets read. +Can be used in situation when content of the file is static and won't change during execution. +When set to `true` it also disables active discovery of the files - only files that were in +the directories when process was started will be read. +It supports `sincedb` entries. When file was processed once, then modified - next run will only +read newly added entries. + +[id="plugins-{type}s-{plugin}-file_chunk_count"] +===== `file_chunk_count` + + * Value type is <> + * Default value is `4611686018427387903` + +When combined with the `file_chunk_size`, this option sets how many chunks (bands or stripes) +are read from each file before moving to the next active file. +For example, a `file_chunk_count` of 32 and a `file_chunk_size` 32KB will process the next 1MB from each active file. +As the default is very large, the file is effectively read to EOF before moving to the next active file. + +[id="plugins-{type}s-{plugin}-file_chunk_size"] +===== `file_chunk_size` + + * Value type is <> + * Default value is `32768` (32KB) + +File content is read off disk in blocks or chunks and lines are extracted from the chunk. +See <> to see why and when to change this setting +from the default. + +[id="plugins-{type}s-{plugin}-file_completed_action"] +===== `file_completed_action` + + * Value can be any of: `delete`, `log`, `log_and_delete` + * The default is `delete`. + +When in `read` mode, what action should be carried out when a file is done with. +If 'delete' is specified then the file will be deleted. If 'log' is specified +then the full path of the file is logged to the file specified in the +`file_completed_log_path` setting. If `log_and_delete` is specified then +both above actions take place. + +[id="plugins-{type}s-{plugin}-file_completed_log_path"] +===== `file_completed_log_path` + + * Value type is <> + * There is no default value for this setting. + +Which file should the completely read file paths be appended to. Only specify +this path to a file when `file_completed_action` is 'log' or 'log_and_delete'. +IMPORTANT: this file is appended to only - it could become very large. You are +responsible for file rotation. + +[id="plugins-{type}s-{plugin}-file_sort_by"] +===== `file_sort_by` + + * Value can be any of: `last_modified`, `path` + * The default is `last_modified`. + +Which attribute of a "watched" file should be used to sort them by. +Files can be sorted by modified date or full path alphabetic. +Previously the processing order of the discovered and therefore +"watched" files was OS dependent. + +[id="plugins-{type}s-{plugin}-file_sort_direction"] +===== `file_sort_direction` + + * Value can be any of: `asc`, `desc` + * The default is `asc`. + +Select between ascending and descending order when sorting "watched" files. +If oldest data first is important then the defaults of `last_modified` + `asc` are good. +If newest data first is more important then opt for `last_modified` + `desc`. 
+If you use special naming conventions for the file full paths then perhaps +`path` + `asc` will help to control the order of file processing. + +[id="plugins-{type}s-{plugin}-ignore_older"] +===== `ignore_older` + + * Value type is <> or <> + * There is no default value for this setting. + +When the file input discovers a file that was last modified +before the specified duration (seconds if a number is specified), the file is ignored. +After it's discovery, if an ignored file is modified it is no +longer ignored and any new data is read. By default, this option is +disabled. Note this unit is in seconds. + +[id="plugins-{type}s-{plugin}-max_open_files"] +===== `max_open_files` + + * Value type is <> + * There is no default value for this setting. + +What is the maximum number of file_handles that this input consumes +at any one time. Use close_older to close some files if you need to +process more files than this number. This should not be set to the +maximum the OS can do because file handles are needed for other +LS plugins and OS processes. +A default of 4095 is set in internally. + +[id="plugins-{type}s-{plugin}-mode"] +===== `mode` + + * Value can be either `tail` or `read`. + * The default value is `tail`. + +What mode do you want the file input to operate in. Tail a few files or +read many content-complete files. Read mode now supports gzip file processing. + +If `read` is specified, these settings can be used: + +* `ignore_older` (older files are not processed) +* `file_completed_action` (what action should be taken when the file is processed) +* `file_completed_log_path` (which file should the completed file path be logged to) + +If `read` is specified, these settings are ignored: + +* `start_position` (files are always read from the beginning) +* `close_older` (files are automatically 'closed' when EOF is reached) + +[id="plugins-{type}s-{plugin}-path"] +===== `path` + + * This is a required setting. + * Value type is <> + * There is no default value for this setting. + +The path(s) to the file(s) to use as an input. +You can use filename patterns here, such as `/var/log/*.log`. +If you use a pattern like `/var/log/**/*.log`, a recursive search +of `/var/log` will be done for all `*.log` files. +Paths must be absolute and cannot be relative. + +You may also configure multiple paths. See an example +on the https://www.elastic.co/guide/en/logstash/{branch}/configuration-file-structure.html#array[Logstash configuration page]. + +[id="plugins-{type}s-{plugin}-sincedb_clean_after"] +===== `sincedb_clean_after` + + * Value type is <> or <> + * The default value for this setting is "2 weeks". + * If a number is specified then it is interpreted as *days* and can be decimal e.g. 0.5 is 12 hours. + +The sincedb record now has a last active timestamp associated with it. +If no changes are detected in a tracked file in the last N days its sincedb +tracking record expires and will not be persisted. +This option helps protect against the inode recycling problem. +Filebeat has an https://www.elastic.co/guide/en/beats/filebeat/{branch}/inode-reuse-issue.html[FAQ about inode recycling]. + +[id="plugins-{type}s-{plugin}-sincedb_path"] +===== `sincedb_path` + + * Value type is <> + * There is no default value for this setting. + +Path of the sincedb database file (keeps track of the current +position of monitored log files) that will be written to disk. 
+The default will write sincedb files to `/plugins/inputs/file` +NOTE: it must be a file path and not a directory path + +[id="plugins-{type}s-{plugin}-sincedb_write_interval"] +===== `sincedb_write_interval` + + * Value type is <> or <> + * Default value is `"15 seconds"` + +How often (in seconds) to write a since database with the current position of +monitored log files. + +[id="plugins-{type}s-{plugin}-start_position"] +===== `start_position` + + * Value can be any of: `beginning`, `end` + * Default value is `"end"` + +Choose where Logstash starts initially reading files: at the beginning or +at the end. The default behavior treats files like live streams and thus +starts at the end. If you have old data you want to import, set this +to 'beginning'. + +This option only modifies "first contact" situations where a file +is new and not seen before, i.e. files that don't have a current +position recorded in a sincedb file read by Logstash. If a file +has already been seen before, this option has no effect and the +position recorded in the sincedb file will be used. + +[id="plugins-{type}s-{plugin}-stat_interval"] +===== `stat_interval` + + * Value type is <> or <> + * Default value is `"1 second"` + +How often (in seconds) we stat files to see if they have been modified. +Increasing this interval will decrease the number of system calls we make, +but increase the time to detect new log lines. +[NOTE] +Discovering new files and checking whether they have grown/or shrunk occurs in a loop. +This loop will sleep for `stat_interval` seconds before looping again. However, if files +have grown, the new content is read and lines are enqueued. +Reading and enqueuing across all grown files can take time, especially if +the pipeline is congested. So the overall loop time is a combination of the +`stat_interval` and the file read time. + +[id="plugins-{type}s-{plugin}-common-options"] +include::{include_path}/{type}.asciidoc[] + + +[id="plugins-{type}s-{plugin}-string_duration"] +// Move this to the includes when we make string durations available generally. +==== String Durations + +Format is `number` `string` and the space between these is optional. +So "45s" and "45 s" are both valid. +[TIP] +Use the most suitable duration, for example, "3 days" rather than "72 hours". + +===== Weeks +Supported values: `w` `week` `weeks`, e.g. "2 w", "1 week", "4 weeks". + +===== Days +Supported values: `d` `day` `days`, e.g. "2 d", "1 day", "2.5 days". + +===== Hours +Supported values: `h` `hour` `hours`, e.g. "4 h", "1 hour", "0.5 hours". + +===== Minutes +Supported values: `m` `min` `minute` `minutes`, e.g. "45 m", "35 min", "1 minute", "6 minutes". + +===== Seconds +Supported values: `s` `sec` `second` `seconds`, e.g. "45 s", "15 sec", "1 second", "2.5 seconds". + +===== Milliseconds +Supported values: `ms` `msec` `msecs`, e.g. "500 ms", "750 msec", "50 msecs +[NOTE] +`milli` `millis` and `milliseconds` are not supported + +===== Microseconds +Supported values: `us` `usec` `usecs`, e.g. 
"600 us", "800 usec", "900 usecs" +[NOTE] +`micro` `micros` and `microseconds` are not supported + +:default_codec!: diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 00000000..e6441136 Binary files /dev/null and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 00000000..b82aa23a --- /dev/null +++ b/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,7 @@ +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-8.7-bin.zip +networkTimeout=10000 +validateDistributionUrl=true +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists diff --git a/gradlew b/gradlew new file mode 100755 index 00000000..1aa94a42 --- /dev/null +++ b/gradlew @@ -0,0 +1,249 @@ +#!/bin/sh + +# +# Copyright © 2015-2021 the original authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +############################################################################## +# +# Gradle start up script for POSIX generated by Gradle. +# +# Important for running: +# +# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is +# noncompliant, but you have some other compliant shell such as ksh or +# bash, then to run this script, type that shell name before the whole +# command line, like: +# +# ksh Gradle +# +# Busybox and similar reduced shells will NOT work, because this script +# requires all of these POSIX shell features: +# * functions; +# * expansions «$var», «${var}», «${var:-default}», «${var+SET}», +# «${var#prefix}», «${var%suffix}», and «$( cmd )»; +# * compound commands having a testable exit status, especially «case»; +# * various built-in commands including «command», «set», and «ulimit». +# +# Important for patching: +# +# (2) This script targets any POSIX shell, so it avoids extensions provided +# by Bash, Ksh, etc; in particular arrays are avoided. +# +# The "traditional" practice of packing multiple parameters into a +# space-separated string is a well documented source of bugs and security +# problems, so this is (mostly) avoided, by progressively accumulating +# options in "$@", and eventually passing that to Java. +# +# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, +# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; +# see the in-line comments for details. +# +# There are tweaks for specific operating systems such as AIX, CygWin, +# Darwin, MinGW, and NonStop. +# +# (3) This script is generated from the Groovy template +# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# within the Gradle project. +# +# You can find Gradle at https://github.com/gradle/gradle/. 
+# +############################################################################## + +# Attempt to set APP_HOME + +# Resolve links: $0 may be a link +app_path=$0 + +# Need this for daisy-chained symlinks. +while + APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path + [ -h "$app_path" ] +do + ls=$( ls -ld "$app_path" ) + link=${ls#*' -> '} + case $link in #( + /*) app_path=$link ;; #( + *) app_path=$APP_HOME$link ;; + esac +done + +# This is normally unused +# shellcheck disable=SC2034 +APP_BASE_NAME=${0##*/} +# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) +APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD=maximum + +warn () { + echo "$*" +} >&2 + +die () { + echo + echo "$*" + echo + exit 1 +} >&2 + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "$( uname )" in #( + CYGWIN* ) cygwin=true ;; #( + Darwin* ) darwin=true ;; #( + MSYS* | MINGW* ) msys=true ;; #( + NONSTOP* ) nonstop=true ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD=$JAVA_HOME/jre/sh/java + else + JAVACMD=$JAVA_HOME/bin/java + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD=java + if ! command -v java >/dev/null 2>&1 + then + die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +fi + +# Increase the maximum file descriptors if we can. +if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then + case $MAX_FD in #( + max*) + # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. + # shellcheck disable=SC2039,SC3045 + MAX_FD=$( ulimit -H -n ) || + warn "Could not query maximum file descriptor limit" + esac + case $MAX_FD in #( + '' | soft) :;; #( + *) + # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. + # shellcheck disable=SC2039,SC3045 + ulimit -n "$MAX_FD" || + warn "Could not set maximum file descriptor limit to $MAX_FD" + esac +fi + +# Collect all arguments for the java command, stacking in reverse order: +# * args from the command line +# * the main class name +# * -classpath +# * -D...appname settings +# * --module-path (only if needed) +# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 
+ +# For Cygwin or MSYS, switch paths to Windows format before running java +if "$cygwin" || "$msys" ; then + APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) + CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) + + JAVACMD=$( cygpath --unix "$JAVACMD" ) + + # Now convert the arguments - kludge to limit ourselves to /bin/sh + for arg do + if + case $arg in #( + -*) false ;; # don't mess with options #( + /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath + [ -e "$t" ] ;; #( + *) false ;; + esac + then + arg=$( cygpath --path --ignore --mixed "$arg" ) + fi + # Roll the args list around exactly as many times as the number of + # args, so each arg winds up back in the position where it started, but + # possibly modified. + # + # NB: a `for` loop captures its iteration list before it begins, so + # changing the positional parameters here affects neither the number of + # iterations, nor the values presented in `arg`. + shift # remove old arg + set -- "$@" "$arg" # push replacement arg + done +fi + + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Collect all arguments for the java command: +# * DEFAULT_JVM_OPTS, JAVA_OPTS, JAVA_OPTS, and optsEnvironmentVar are not allowed to contain shell fragments, +# and any embedded shellness will be escaped. +# * For example: A user cannot expect ${Hostname} to be expanded, as it is an environment variable and will be +# treated as '${Hostname}' itself on the command line. + +set -- \ + "-Dorg.gradle.appname=$APP_BASE_NAME" \ + -classpath "$CLASSPATH" \ + org.gradle.wrapper.GradleWrapperMain \ + "$@" + +# Stop when "xargs" is not available. +if ! command -v xargs >/dev/null 2>&1 +then + die "xargs is not available" +fi + +# Use "xargs" to parse quoted args. +# +# With -n1 it outputs one arg per line, with the quotes and backslashes removed. +# +# In Bash we could simply go: +# +# readarray ARGS < <( xargs -n1 <<<"$var" ) && +# set -- "${ARGS[@]}" "$@" +# +# but POSIX shell has neither arrays nor command substitution, so instead we +# post-process each arg (as a line of input to sed) to backslash-escape any +# character that might be a shell metacharacter, then use eval to reverse +# that process (while maintaining the separation between arguments), and wrap +# the whole thing up as a single "set" statement. +# +# This will of course break if any of these variables contains a newline or +# an unmatched quote. +# + +eval "set -- $( + printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | + xargs -n1 | + sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | + tr '\n' ' ' + )" '"$@"' + +exec "$JAVACMD" "$@" diff --git a/gradlew.bat b/gradlew.bat new file mode 100644 index 00000000..7101f8e4 --- /dev/null +++ b/gradlew.bat @@ -0,0 +1,92 @@ +@rem +@rem Copyright 2015 the original author or authors. +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. +@rem You may obtain a copy of the License at +@rem +@rem https://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. 
+@rem + +@if "%DEBUG%"=="" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%"=="" set DIRNAME=. +@rem This is normally unused +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Resolve any "." and ".." in APP_HOME to make it shorter. +for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if %ERRORLEVEL% equ 0 goto execute + +echo. 1>&2 +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2 +echo. 1>&2 +echo Please set the JAVA_HOME variable in your environment to match the 1>&2 +echo location of your Java installation. 1>&2 + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto execute + +echo. 1>&2 +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2 +echo. 1>&2 +echo Please set the JAVA_HOME variable in your environment to match the 1>&2 +echo location of your Java installation. 1>&2 + +goto fail + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* + +:end +@rem End local scope for the variables with windows NT shell +if %ERRORLEVEL% equ 0 goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! +set EXIT_CODE=%ERRORLEVEL% +if %EXIT_CODE% equ 0 set EXIT_CODE=1 +if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% +exit /b %EXIT_CODE% + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/lib/filewatch/bootstrap.rb b/lib/filewatch/bootstrap.rb new file mode 100644 index 00000000..d4779aea --- /dev/null +++ b/lib/filewatch/bootstrap.rb @@ -0,0 +1,81 @@ +# encoding: utf-8 +require "pathname" + +## Common setup +# all the required constants and files +# defined in one place +module FileWatch + # the number of bytes read from a file during the read phase + FILE_READ_SIZE = 32768 + # the largest fixnum in ruby + # this is used in the read loop e.g. + # @opts[:file_chunk_count].times do + # where file_chunk_count defaults to this constant + MAX_ITERATIONS = (2**(0.size * 8 - 2) - 2) / 32768 + + require_relative "helper" + + gem_root_dir = Pathname.new(__FILE__).dirname.join("../../").realpath + jar_version = gem_root_dir.join("JAR_VERSION").read.strip + fullpath = gem_root_dir.join("lib/jars/filewatch-#{jar_version}.jar").expand_path.to_path + require "java" + require fullpath + require "jruby_file_watch" + + if LogStash::Environment.windows? 
+ require_relative "winhelper" + require_relative "stat/windows_path" + PathStatClass = Stat::WindowsPath + FileOpener = FileExt + else + require_relative "stat/generic" + PathStatClass = Stat::Generic + FileOpener = ::File + end + + # Structs can be used as hash keys because they compare by value + # this is used as the key for values in the sincedb hash + InodeStruct = Struct.new(:inode, :maj, :min) do + def to_s + to_a.join(" ") + end + end + + BufferExtractResult = Struct.new(:lines, :warning, :additional) + + class LoopControlResult + attr_reader :count, :size, :more + + def initialize(count, size, more) + @count, @size, @more = count, size, more + @read_error_detected = false + end + + def flag_read_error + @read_error_detected = true + end + + def keep_looping? + !@read_error_detected && @more + end + end + + class NoSinceDBPathGiven < StandardError; end + + # how often (in seconds) we logger.warn a failed file open, per path. + OPEN_WARN_INTERVAL = ENV.fetch("FILEWATCH_OPEN_WARN_INTERVAL", 300).to_i + MAX_FILES_WARN_INTERVAL = ENV.fetch("FILEWATCH_MAX_FILES_WARN_INTERVAL", 20).to_i + + require "logstash/util/buftok" + require_relative "settings" + require_relative "sincedb_value" + require_relative "sincedb_record_serializer" + require_relative "watched_files_collection" + require_relative "sincedb_collection" + require_relative "watch" + require_relative "watched_file" + require_relative "discoverer" + require_relative "observing_base" + require_relative "observing_tail" + require_relative "observing_read" +end diff --git a/lib/filewatch/discoverer.rb b/lib/filewatch/discoverer.rb new file mode 100644 index 00000000..10723235 --- /dev/null +++ b/lib/filewatch/discoverer.rb @@ -0,0 +1,107 @@ +# encoding: utf-8 +require "logstash/util/loggable" + +module FileWatch + class Discoverer + # given a path or glob will prepare for and discover files to watch + # if they are not excluded or ignorable + # they are added to the watched_files collection and + # associated with a sincedb entry if one can be found + include LogStash::Util::Loggable + + attr_reader :watched_files_collection + + def initialize(watched_files_collection, sincedb_collection, settings) + @watching = Concurrent::Array.new + @exclude = Concurrent::Array.new + @watched_files_collection = watched_files_collection + @sincedb_collection = sincedb_collection + @settings = settings + @settings.exclude.each { |p| @exclude << p } + end + + def add_path(path) + return if @watching.member?(path) + @watching << path + discover_files_new_path(path) + self + end + + def discover + @watching.each do |path| + discover_files_ongoing(path) + end + end + + private + + def can_exclude?(watched_file, new_discovery) + @exclude.each do |pattern| + if watched_file.pathname.basename.fnmatch?(pattern) + if new_discovery + logger.trace("skipping file because it matches exclude", :path => watched_file.path, :pattern => pattern) + end + watched_file.unwatch + return true + end + end + false + end + + def discover_files_new_path(path) + discover_any_files(path, false) + end + + def discover_files_ongoing(path) + discover_any_files(path, true) + end + + def discover_any_files(path, ongoing) + fileset = Dir.glob(path).select { |f| File.file?(f) } + logger.trace("discover_files", :count => fileset.size) + fileset.each do |file| + new_discovery = false + watched_file = @watched_files_collection.get(file) + if watched_file.nil? 
+ pathname = Pathname.new(file) + begin + path_stat = PathStatClass.new(pathname) + rescue Errno::ENOENT + next + end + watched_file = WatchedFile.new(pathname, path_stat, @settings) + new_discovery = true + end + # if it already unwatched or its excluded then we can skip + next if watched_file.unwatched? || can_exclude?(watched_file, new_discovery) + + logger.trace? && logger.trace("handling:", :new_discovery => new_discovery, :watched_file => watched_file.details) + + if new_discovery + watched_file.initial_completed if ongoing + # initially when the sincedb collection is filled with records from the persistence file + # each value is not associated with a watched file + # a sincedb_value can be: + # unassociated + # associated with this watched_file + # associated with a different watched_file + if @sincedb_collection.associate(watched_file) + if watched_file.file_ignorable? + logger.trace("skipping file because it was last modified more than #{@settings.ignore_older} seconds ago", :path => file) + # on discovery ignorable watched_files are put into the ignored state and that + # updates the size from the internal stat + # so the existing contents are not read. + # because, normally, a newly discovered file will + # have a watched_file size of zero + # they are still added to the collection so we know they are there for the next periodic discovery + watched_file.ignore_as_unread + end + # now add the discovered file to the watched_files collection and adjust the sincedb collections + @watched_files_collection.add(watched_file) + end + end + # at this point the watched file is created, is in the db but not yet opened or being processed + end + end + end +end diff --git a/lib/filewatch/helper.rb b/lib/filewatch/helper.rb new file mode 100644 index 00000000..2fbe1a27 --- /dev/null +++ b/lib/filewatch/helper.rb @@ -0,0 +1,68 @@ +# encoding: utf-8 +# code downloaded from Ruby on Rails 4.2.1 +# https://raw.githubusercontent.com/rails/rails/v4.2.1/activesupport/lib/active_support/core_ext/file/atomic.rb +# change method name to avoid borking active_support and vice versa +require 'fileutils' + +module FileHelper + extend self + # Write to a file atomically. Useful for situations where you don't + # want other processes or threads to see half-written files. + # + # File.write_atomically('important.file') do |file| + # file.write('hello') + # end + def write_atomically(file_name) + + if File.exist?(file_name) + # Get original file permissions + old_stat = File.stat(file_name) + else + # If not possible, probe which are the default permissions in the + # destination directory. + old_stat = probe_stat_in(File.dirname(file_name)) + end + + mode = old_stat ? old_stat.mode : nil + + # Create temporary file with identical permissions + temp_file = File.new(rand_filename(file_name), "w", mode) + temp_file.binmode + return_val = yield temp_file + temp_file.close + new_stat = File.stat(temp_file) + + # Overwrite original file with temp file + File.rename(temp_file.path, file_name) + + # Unable to get permissions of the original file => return + return return_val if old_stat.nil? + + # Set correct uid/gid on new file if ownership is different. + if old_stat && (old_stat.gid != new_stat.gid || old_stat.uid != new_stat.uid) + File.chown(old_stat.uid, old_stat.gid, file_name) if old_stat + end + + return_val + end + + def device?(file_name) + File.chardev?(file_name) || File.blockdev?(file_name) + end + + # Private utility method. 
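Discoverer#can_exclude? above tests each exclude pattern against the basename only, via Pathname#fnmatch?. A quick sketch of that behaviour, with illustrative paths and pattern:

    require "pathname"
    pattern = "*.gz"
    Pathname.new("/var/log/app/app.log.1.gz").basename.fnmatch?(pattern)  # => true, excluded
    Pathname.new("/var/log/app/app.log").basename.fnmatch?(pattern)       # => false, kept
    Pathname.new("/var/gz/app.log").basename.fnmatch?(pattern)            # => false, directories are not matched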
+ def probe_stat_in(dir) #:nodoc: + basename = rand_filename(".permissions_check") + file_name = File.join(dir, basename) + FileUtils.touch(file_name) + File.stat(file_name) + rescue + # ... + ensure + FileUtils.rm_f(file_name) if File.exist?(file_name) + end + + def rand_filename(prefix) + [ prefix, Thread.current.object_id, Process.pid, rand(1000000) ].join('.') + end +end diff --git a/lib/filewatch/observing_base.rb b/lib/filewatch/observing_base.rb new file mode 100644 index 00000000..06cd3170 --- /dev/null +++ b/lib/filewatch/observing_base.rb @@ -0,0 +1,87 @@ +# encoding: utf-8 + +## Interface API topology +# ObservingBase module (this file) +# is a module mixin proving common constructor and external API for File Input Plugin interaction +# calls build_specific_processor on ObservingRead or ObservingTail +# ObservingRead and ObservingTail +# provides the External API method subscribe(observer = NullObserver.new) +# build_specific_processor(settings) - provide a Tail or Read specific Processor. +# TailMode::Processor or ReadMode::Processor +# initialize_handlers(sincedb_collection, observer) - called when the observer subscribes to changes in a Mode, +# builds mode specific handler instances with references to the observer +# process_closed(watched_files) - provide specific processing of watched_files in the closed state +# process_ignored(watched_files) - provide specific processing of watched_files in the ignored state +# process_watched(watched_files) - provide specific processing of watched_files in the watched state +# process_active(watched_files) - provide specific processing of watched_files in the active state +# These methods can call "handler" methods that delegate to the specific Handler classes. +# TailMode::Handlers module namespace +# contains the Handler classes that deals with Tail mode file lifecycle "events". +# The TailMode::Handlers::Base +# handle(watched_file) - this method calls handle_specifically defined in a subclass +# handle_specifically(watched_file) - this is a noop method +# update_existing_specifically(watched_file, sincedb_value) - this is a noop method +# Each handler extends the Base class to provide specific implementations of these two methods: +# handle_specifically(watched_file) +# update_existing_specifically(watched_file, sincedb_value) +# ReadMode::Handlers module namespace +# contains the Handler classes that deals with Read mode file lifecycle "events". +# The ReadMode::Handlers::Base +# handle(watched_file) - this method calls handle_specifically defined in a subclass +# handle_specifically(watched_file) - this is a noop method +# Each handler extends the Base class to provide specific implementations of this method: +# handle_specifically(watched_file) + +module FileWatch + module ObservingBase + attr_reader :watch, :sincedb_collection, :settings + + def initialize(opts={}) + options = { + :sincedb_write_interval => 10, + :stat_interval => 1, + :discover_interval => 5, + :exclude => [], + :start_new_files_at => :end, + :delimiter => "\n", + :file_chunk_count => MAX_ITERATIONS, + :file_chunk_size => FILE_READ_SIZE, + :file_sort_by => "last_modified", + :file_sort_direction => "asc", + }.merge(opts) + unless options.include?(:sincedb_path) + raise NoSinceDBPathGiven.new("No sincedb_path set in options. 
This should have been added in the main LogStash::Inputs::File class") + end + @settings = Settings.from_options(options) + build_watch_and_dependencies + end + + def build_watch_and_dependencies + logger.info("START, creating Discoverer, Watch with file and sincedb collections") + watched_files_collection = WatchedFilesCollection.new(@settings) + @sincedb_collection = SincedbCollection.new(@settings) + @sincedb_collection.open + discoverer = Discoverer.new(watched_files_collection, @sincedb_collection, @settings) + @watch = Watch.new(discoverer, build_specific_processor(@settings), @settings) + end + + def watch_this(path) + @watch.watch(path) + end + + def sincedb_write(reason=nil) + # can be invoked from the file input + @sincedb_collection.write(reason) + end + + # quit is a sort-of finalizer, + # it should be called for clean up + # before the instance is disposed of. + def quit + logger.info("QUIT - closing all files and shutting down.") + @watch.quit # <-- should close all the files + # sincedb_write("shutting down") + end + + end +end diff --git a/lib/filewatch/observing_read.rb b/lib/filewatch/observing_read.rb new file mode 100644 index 00000000..2a6316e6 --- /dev/null +++ b/lib/filewatch/observing_read.rb @@ -0,0 +1,22 @@ +# encoding: utf-8 +require "logstash/util/loggable" +require_relative "read_mode/processor" + +module FileWatch + class ObservingRead + include LogStash::Util::Loggable + include ObservingBase + + def subscribe(observer) + # observer here is the file input + watch.subscribe(observer, sincedb_collection) + sincedb_collection.write("read mode subscribe complete - shutting down") + end + + private + + def build_specific_processor(settings) + ReadMode::Processor.new(settings) + end + end +end diff --git a/lib/filewatch/observing_tail.rb b/lib/filewatch/observing_tail.rb new file mode 100644 index 00000000..e2f51b6f --- /dev/null +++ b/lib/filewatch/observing_tail.rb @@ -0,0 +1,22 @@ +# encoding: utf-8 +require "logstash/util/loggable" +require_relative 'tail_mode/processor' + +module FileWatch + class ObservingTail + include LogStash::Util::Loggable + include ObservingBase + + def subscribe(observer) + # observer here is the file input + watch.subscribe(observer, sincedb_collection) + sincedb_collection.write("tail mode subscribe complete - shutting down") + end + + private + + def build_specific_processor(settings) + TailMode::Processor.new(settings) + end + end +end diff --git a/lib/filewatch/processor.rb b/lib/filewatch/processor.rb new file mode 100644 index 00000000..350fa875 --- /dev/null +++ b/lib/filewatch/processor.rb @@ -0,0 +1,55 @@ +# encoding: utf-8 +require "logstash/util/loggable" +require 'concurrent/atomic/atomic_reference' + +module FileWatch + class Processor + include LogStash::Util::Loggable + + attr_reader :watch + + def initialize(settings) + @settings = settings + @deletable_paths = Concurrent::AtomicReference.new [] + end + + def add_watch(watch) + @watch = watch + self + end + + def clear_deletable_paths + @deletable_paths.get_and_set [] + end + + def add_deletable_path(path) + @deletable_paths.get << path + end + + def restat(watched_file) + changed = watched_file.restat! 
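The Processor above keeps deletable paths in a Concurrent::AtomicReference and drains them with get_and_set, so the caller takes ownership of the whole array in a single swap. A minimal sketch of that pattern, using a hypothetical path:

    require "concurrent/atomic/atomic_reference"
    deletable = Concurrent::AtomicReference.new([])
    deletable.get << "/tmp/finished.log"    # add_deletable_path
    drained = deletable.get_and_set([])     # clear_deletable_paths: swap in a fresh array
    drained                                 # => ["/tmp/finished.log"]
    deletable.get                           # => [], ready for the next cycle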
+ if changed + # the collection (when sorted by modified_at) needs to re-sort every time watched-file is modified, + # we can perform these update operation while processing files (stat interval) instead of having to + # re-sort the whole collection every time an entry is accessed + @watch.watched_files_collection.update(watched_file) + end + end + + private + + def error_details(error, watched_file) + details = { :path => watched_file.path, + :exception => error.class, + :message => error.message, + :backtrace => error.backtrace } + if logger.debug? + details[:file] = watched_file + else + details[:backtrace] = details[:backtrace].take(8) if details[:backtrace] + end + details + end + + end +end diff --git a/lib/filewatch/read_mode/handlers/base.rb b/lib/filewatch/read_mode/handlers/base.rb new file mode 100644 index 00000000..ce02f8e6 --- /dev/null +++ b/lib/filewatch/read_mode/handlers/base.rb @@ -0,0 +1,100 @@ +# encoding: utf-8 +require "logstash/util/loggable" + +module FileWatch module ReadMode module Handlers + class Base + include LogStash::Util::Loggable + + attr_reader :sincedb_collection + + def initialize(processor, sincedb_collection, observer, settings) + @settings = settings + @processor = processor + @sincedb_collection = sincedb_collection + @observer = observer + end + + def quit? + @processor.watch.quit? + end + + def handle(watched_file) + logger.trace? && logger.trace("handling:", :path => watched_file.path) + unless watched_file.has_listener? + watched_file.set_listener(@observer) + end + handle_specifically(watched_file) + end + + def handle_specifically(watched_file) + # some handlers don't need to define this method + end + + private + + def open_file(watched_file) + return true if watched_file.file_open? + logger.trace? && logger.trace("opening", :path => watched_file.path) + begin + watched_file.open + rescue => e + # don't emit this message too often. if a file that we can't + # read is changing a lot, we'll try to open it more often, and spam the logs. + now = Time.now.to_i + logger.trace? && logger.trace("opening OPEN_WARN_INTERVAL is '#{OPEN_WARN_INTERVAL}'") + if watched_file.last_open_warning_at.nil? || now - watched_file.last_open_warning_at > OPEN_WARN_INTERVAL + backtrace = e.backtrace + backtrace = backtrace.take(3) if backtrace && !logger.debug? + logger.warn("failed to open", :path => watched_file.path, :exception => e.class, :message => e.message, :backtrace => backtrace) + watched_file.last_open_warning_at = now + else + logger.trace? && logger.trace("suppressed warning (failed to open)", :path => watched_file.path, :exception => e.class, :message => e.message) + end + watched_file.watch # set it back to watch so we can try it again + end + if watched_file.file_open? + watched_file.listener.opened + true + else + false + end + end + + def add_or_update_sincedb_collection(watched_file) + sincedb_value = @sincedb_collection.find(watched_file) + if sincedb_value.nil? + add_new_value_sincedb_collection(watched_file) + elsif sincedb_value.watched_file == watched_file + update_existing_sincedb_collection_value(watched_file, sincedb_value) + else + logger.trace? && logger.trace("add_or_update_sincedb_collection: the found sincedb_value has a watched_file - this is a rename, switching inode to this watched file") + existing_watched_file = sincedb_value.watched_file + if existing_watched_file.nil? 
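open_file above rate-limits its "failed to open" warning per file using OPEN_WARN_INTERVAL (300 seconds by default, overridable through the FILEWATCH_OPEN_WARN_INTERVAL environment variable). A stand-alone sketch of that gating; the loop and timestamps are only illustrative:

    open_warn_interval = ENV.fetch("FILEWATCH_OPEN_WARN_INTERVAL", 300).to_i
    last_open_warning_at = nil
    3.times do
      now = Time.now.to_i
      if last_open_warning_at.nil? || now - last_open_warning_at > open_warn_interval
        warn "failed to open"            # emitted on the first failure only
        last_open_warning_at = now
      else
        # later failures inside the interval are only logged at trace level
      end
    end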
+ sincedb_value.set_watched_file(watched_file) + logger.trace("add_or_update_sincedb_collection: switching as new file") + watched_file.rotate_as_file + watched_file.update_bytes_read(sincedb_value.position) + else + sincedb_value.set_watched_file(watched_file) + logger.trace? && logger.trace("add_or_update_sincedb_collection: switching from", :watched_file => watched_file.details) + watched_file.rotate_from(existing_watched_file) + end + + end + watched_file.initial_completed + end + + def update_existing_sincedb_collection_value(watched_file, sincedb_value) + logger.trace? && logger.trace("update_existing_sincedb_collection_value: #{watched_file.path}, last value #{sincedb_value.position}, cur size #{watched_file.last_stat_size}") + # sincedb_value is the source of truth + watched_file.update_bytes_read(sincedb_value.position) + end + + def add_new_value_sincedb_collection(watched_file) + sincedb_value = SincedbValue.new(0) + sincedb_value.set_watched_file(watched_file) + logger.trace? && logger.trace("add_new_value_sincedb_collection:", :path => watched_file.path, :position => sincedb_value.position) + sincedb_collection.set(watched_file.sincedb_key, sincedb_value) + end + end +end end end diff --git a/lib/filewatch/read_mode/handlers/read_file.rb b/lib/filewatch/read_mode/handlers/read_file.rb new file mode 100644 index 00000000..824ac2cf --- /dev/null +++ b/lib/filewatch/read_mode/handlers/read_file.rb @@ -0,0 +1,90 @@ +# encoding: utf-8 + +module FileWatch module ReadMode module Handlers + class ReadFile < Base + + # seek file to which ever is furthest: either current bytes read or sincedb position + private + def seek_to_furthest_position(watched_file) + previous_pos = sincedb_collection.find(watched_file).position + watched_file.file_seek([watched_file.bytes_read, previous_pos].max) + end + + public + def handle_specifically(watched_file) + if open_file(watched_file) + add_or_update_sincedb_collection(watched_file) unless sincedb_collection.member?(watched_file.sincedb_key) + seek_to_furthest_position(watched_file) + loop do + break if quit? + loop_control = watched_file.loop_control_adjusted_for_stat_size + controlled_read(watched_file, loop_control) + sincedb_collection.request_disk_flush + break unless loop_control.keep_looping? + end + if watched_file.all_read? + # flush the buffer now in case there is no final delimiter + line = watched_file.buffer.flush + watched_file.listener.accept(line) unless line.empty? + watched_file.listener.eof + watched_file.file_close + key = watched_file.sincedb_key + if sincedb_collection.get(key) + sincedb_collection.reading_completed(key) + sincedb_collection.clear_watched_file(key) + end + watched_file.listener.deleted + # NOTE: on top of un-watching we should also remove from the watched files collection + # if the file is getting deleted (on completion), that part currently resides in + # DeleteCompletedFileHandler - triggered above using `watched_file.listener.deleted` + watched_file.unwatch + end + end + end + + def controlled_read(watched_file, loop_control) + logger.trace? && logger.trace("reading...", :filename => watched_file.filename, :iterations => loop_control.count, :amount => loop_control.size) + loop_control.count.times do + break if quit? + begin + result = watched_file.read_extract_lines(loop_control.size) # expect BufferExtractResult + logger.info(result.warning, result.additional) unless result.warning.empty? 
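ReadFile#handle_specifically above loops on a LoopControlResult (defined in bootstrap.rb): it keeps requesting chunks while more data is expected and no read error has been flagged. A small sketch of that contract, assuming the FileWatch classes above are loaded:

    control = FileWatch::LoopControlResult.new(3, 32768, true)  # count, chunk size, more data expected
    control.keep_looping?   # => true
    control.flag_read_error
    control.keep_looping?   # => false, the read loop exits on the next check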
+ result.lines.each do |line| + watched_file.listener.accept(line) + # sincedb position is independent from the watched_file bytes_read + delta = line.bytesize + @settings.delimiter_byte_size + sincedb_collection.increment(watched_file.sincedb_key, delta) + break if quit? + end + rescue EOFError => e + log_error("controlled_read: eof error reading file", watched_file, e) + loop_control.flag_read_error + break + rescue Errno::EWOULDBLOCK, Errno::EINTR => e + log_error("controlled_read: block or interrupt error reading file", watched_file, e) + watched_file.listener.error + loop_control.flag_read_error + break + rescue => e + log_error("controlled_read: general error reading file", watched_file, e) + watched_file.listener.error + loop_control.flag_read_error + break + end + end + end + + def log_error(msg, watched_file, error) + details = { :path => watched_file.path, + :exception => error.class, + :message => error.message, + :backtrace => error.backtrace } + if logger.debug? + details[:file] = watched_file + else + details[:backtrace] = details[:backtrace].take(8) if details[:backtrace] + end + logger.error(msg, details) + end + end +end end end diff --git a/lib/filewatch/read_mode/handlers/read_zip_file.rb b/lib/filewatch/read_mode/handlers/read_zip_file.rb new file mode 100644 index 00000000..e7eb21f2 --- /dev/null +++ b/lib/filewatch/read_mode/handlers/read_zip_file.rb @@ -0,0 +1,92 @@ +# encoding: utf-8 +require 'java' + +module FileWatch module ReadMode module Handlers + + java_import java.io.InputStream + java_import java.io.InputStreamReader + java_import java.io.FileInputStream + java_import java.io.BufferedReader + java_import java.util.zip.GZIPInputStream + java_import java.util.zip.ZipException + + class ReadZipFile < Base + def handle_specifically(watched_file) + add_or_update_sincedb_collection(watched_file) unless sincedb_collection.member?(watched_file.sincedb_key) + # can't really stripe read a zip file, its all or nothing. + watched_file.listener.opened + # what do we do about quit when we have just begun reading the zipped file (e.g. pipeline reloading) + # should we track lines read in the sincedb and + # fast forward through the lines until we reach unseen content? + # meaning that we can quit in the middle of a zip file + key = watched_file.sincedb_key + + if @settings.check_archive_validity && corrupted?(watched_file) + watched_file.unwatch + else + begin + file_stream = FileInputStream.new(watched_file.path) + gzip_stream = GZIPInputStream.new(file_stream) + decoder = InputStreamReader.new(gzip_stream, "UTF-8") + buffered = BufferedReader.new(decoder) + while (line = buffered.readLine()) + watched_file.listener.accept(line) + # can't quit, if we did then we would incorrectly write a 'completed' sincedb entry + # what do we do about quit when we have just begun reading the zipped file (e.g. pipeline reloading) + # should we track lines read in the sincedb and + # fast forward through the lines until we reach unseen content? 
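In controlled_read above, the sincedb position advances by the bytes of each emitted line plus the delimiter, independently of watched_file.bytes_read. A worked example with the default "\n" delimiter and an illustrative line:

    delimiter = "\n"
    line = "2017-04-01T12:00:00 first event"
    delta = line.bytesize + delimiter.bytesize   # => 32, bytes consumed for this line
    # sincedb_collection.increment(key, delta) then records exactly this many bytes as read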
+ # meaning that we can quit in the middle of a zip file + end + watched_file.listener.eof + rescue ZipException => e + logger.error("Cannot decompress the gzip file at path: #{watched_file.path}", :exception => e.class, + :message => e.message, :backtrace => e.backtrace) + watched_file.listener.error + else + sincedb_collection.store_last_read(key, watched_file.last_stat_size) + sincedb_collection.request_disk_flush + watched_file.listener.deleted + watched_file.unwatch + ensure + # rescue each close individually so all close attempts are tried + close_and_ignore_ioexception(buffered) unless buffered.nil? + close_and_ignore_ioexception(decoder) unless decoder.nil? + close_and_ignore_ioexception(gzip_stream) unless gzip_stream.nil? + close_and_ignore_ioexception(file_stream) unless file_stream.nil? + end + end + sincedb_collection.clear_watched_file(key) + end + + private + + def close_and_ignore_ioexception(closeable) + begin + closeable.close + rescue Exception => e # IOException can be thrown by any of the Java classes that implement the Closable interface. + logger.warn("Ignoring an IOException when closing an instance of #{closeable.class.name}", + :exception => e.class, :message => e.message, :backtrace => e.backtrace) + end + end + + def corrupted?(watched_file) + begin + start = Time.new + file_stream = FileInputStream.new(watched_file.path) + gzip_stream = GZIPInputStream.new(file_stream) + buffer = Java::byte[8192].new + until gzip_stream.read(buffer) == -1 + end + return false + rescue ZipException, Java::JavaIo::EOFException => e + duration = Time.now - start + logger.warn("Detected corrupted archive #{watched_file.path} file won't be processed", :message => e.message, + :duration => duration.round(3)) + return true + ensure + close_and_ignore_ioexception(gzip_stream) unless gzip_stream.nil? + close_and_ignore_ioexception(file_stream) unless file_stream.nil? + end + end + end +end end end diff --git a/lib/filewatch/read_mode/processor.rb b/lib/filewatch/read_mode/processor.rb new file mode 100644 index 00000000..09ee7025 --- /dev/null +++ b/lib/filewatch/read_mode/processor.rb @@ -0,0 +1,118 @@ +# encoding: utf-8 +require 'filewatch/processor' +require_relative "handlers/base" +require_relative "handlers/read_file" +require_relative "handlers/read_zip_file" + +module FileWatch module ReadMode + # Must handle + # :read_file + # :read_zip_file + class Processor < FileWatch::Processor + + def initialize_handlers(sincedb_collection, observer) + # we deviate from the tail mode handler initialization here + # by adding a reference to self so we can read the quit flag during a (depth first) read loop + @read_file = Handlers::ReadFile.new(self, sincedb_collection, observer, @settings) + @read_zip_file = Handlers::ReadZipFile.new(self, sincedb_collection, observer, @settings) + end + + def read_file(watched_file) + @read_file.handle(watched_file) + end + + def read_zip_file(watched_file) + @read_zip_file.handle(watched_file) + end + + def process_all_states(watched_files) + process_watched(watched_files) + return if watch.quit? + process_active(watched_files) + end + + private + + def process_watched(watched_files) + logger.trace(__method__.to_s) + # Handles watched_files in the watched state. + # for a slice of them: + # move to the active state + # should never have been active before + # how much of the max active window is available + to_take = @settings.max_active - watched_files.count { |wf| wf.active? 
} + if to_take > 0 + watched_files.select(&:watched?).take(to_take).each do |watched_file| + begin + restat(watched_file) + watched_file.activate + rescue Errno::ENOENT + common_deleted_reaction(watched_file, __method__) + next + rescue => e + common_error_reaction(watched_file, e, __method__) + next + end + break if watch.quit? + end + else + now = Time.now.to_i + if (now - watch.lastwarn_max_files) > MAX_FILES_WARN_INTERVAL + waiting = watched_files.size - @settings.max_active + logger.warn("#{@settings.max_warn_msg}, files yet to open: #{waiting}") + watch.lastwarn_max_files = now + end + end + end + + ## TODO add process_rotation_in_progress + + def process_active(watched_files) + logger.trace(__method__.to_s) + # Handles watched_files in the active state. + watched_files.each do |watched_file| + next unless watched_file.active? + + begin + restat(watched_file) + rescue Errno::ENOENT + common_deleted_reaction(watched_file, __method__) + next + rescue => e + common_error_reaction(watched_file, e, __method__) + next + end + break if watch.quit? + + if watched_file.compressed? + read_zip_file(watched_file) + else + read_file(watched_file) + end + + if @settings.exit_after_read + common_detach_when_allread(watched_file) + end + # handlers take care of closing and unwatching + end + end + + def common_detach_when_allread(watched_file) + watched_file.unwatch + watched_file.listener.reading_completed + add_deletable_path watched_file.path + logger.trace? && logger.trace("whole file read, removing from collection", :path => watched_file.path) + end + + def common_deleted_reaction(watched_file, action) + # file has gone away or we can't read it anymore. + watched_file.unwatch + add_deletable_path watched_file.path + logger.trace? && logger.trace("#{action} - stat failed, removing from collection", :path => watched_file.path) + end + + def common_error_reaction(watched_file, error, action) + logger.error("#{action} - other error", error_details(error, watched_file)) + end + end +end end diff --git a/lib/filewatch/settings.rb b/lib/filewatch/settings.rb new file mode 100644 index 00000000..0bc7af56 --- /dev/null +++ b/lib/filewatch/settings.rb @@ -0,0 +1,66 @@ +# encoding: utf-8 + +module FileWatch + class Settings + attr_reader :delimiter, :close_older, :ignore_older, :delimiter_byte_size + attr_reader :max_active, :max_warn_msg, :lastwarn_max_files + attr_reader :sincedb_write_interval, :stat_interval, :discover_interval + attr_reader :exclude, :start_new_files_at, :file_chunk_count, :file_chunk_size + attr_reader :sincedb_path, :sincedb_expiry_duration + attr_reader :file_sort_by, :file_sort_direction + attr_reader :exit_after_read + attr_reader :check_archive_validity + + def self.from_options(opts) + new.add_options(opts) + end + + def initialize + defaults = { + :delimiter => "\n", + :file_chunk_size => FILE_READ_SIZE, + :max_open_files => 4095, + :file_chunk_count => MAX_ITERATIONS, + :sincedb_clean_after => 14, + :exclude => [], + :stat_interval => 1, + :discover_interval => 5, + :file_sort_by => "last_modified", + :file_sort_direction => "asc", + } + @opts = {} + @lastwarn_max_files = 0 + add_options(defaults) + end + + def add_options(opts) + @opts.update(opts) + self.max_open_files = @opts[:max_open_files] + @delimiter = @opts[:delimiter] + @delimiter_byte_size = @delimiter.bytesize + @file_chunk_size = @opts[:file_chunk_size] + @close_older = @opts[:close_older] + @ignore_older = @opts[:ignore_older] + @stat_interval = @opts[:stat_interval] + @discover_interval = 
@opts[:discover_interval] + @exclude = Array(@opts[:exclude]) + @start_new_files_at = @opts[:start_new_files_at] + @file_chunk_count = @opts[:file_chunk_count] + @sincedb_path = @opts[:sincedb_path] + @sincedb_write_interval = @opts[:sincedb_write_interval] + @sincedb_expiry_duration = @opts.fetch(:sincedb_clean_after) + @file_sort_by = @opts[:file_sort_by] + @file_sort_direction = @opts[:file_sort_direction] + @exit_after_read = @opts[:exit_after_read] + @check_archive_validity = @opts[:check_archive_validity] + self + end + + def max_open_files=(value) + val = value.to_i + val = 4095 if value.nil? || val <= 0 + @max_warn_msg = "Reached open files limit: #{val}, set by the 'max_open_files' option or default" + @max_active = val + end + end +end diff --git a/lib/filewatch/sincedb_collection.rb b/lib/filewatch/sincedb_collection.rb new file mode 100644 index 00000000..3694cef0 --- /dev/null +++ b/lib/filewatch/sincedb_collection.rb @@ -0,0 +1,251 @@ +# encoding: utf-8 +require "logstash/util/loggable" + +module FileWatch + # this KV collection has a watched_file storage_key (an InodeStruct) as the key + # and a SincedbValue as the value. + # the SincedbValues are built by reading the sincedb file. + class SincedbCollection + include LogStash::Util::Loggable + + attr_reader :path + attr_writer :serializer + + def initialize(settings) + @settings = settings + @sincedb_last_write = 0 + @sincedb = {} + @serializer = SincedbRecordSerializer.new(@settings.sincedb_expiry_duration) + @path = Pathname.new(@settings.sincedb_path) + @write_method = LogStash::Environment.windows? || @path.chardev? || @path.blockdev? ? method(:non_atomic_write) : method(:atomic_write) + @full_path = @path.to_path + FileUtils.touch(@full_path) + @write_requested = false + end + + def write_requested? + @write_requested + end + + def request_disk_flush + @write_requested = true + flush_at_interval + end + + def write_if_requested + if write_requested? + flush_at_interval + end + end + + def write(reason=nil) + logger.trace("caller requested sincedb write (#{reason})") + sincedb_write + end + + def open + @time_sdb_opened = Time.now.to_f + begin + path.open do |file| + logger.debug("open: reading from #{path}") + @serializer.deserialize(file) do |key, value| + logger.trace? && logger.trace("open: importing #{key.inspect} => #{value.inspect}") + set_key_value(key, value) + end + end + logger.trace("open: count of keys read: #{@sincedb.keys.size}") + rescue => e + #No existing sincedb to load + logger.debug("open: error opening #{path}", :exception => e.class, :message => e.message) + end + end + + def associate(watched_file) + logger.trace? && logger.trace("associate: finding", :path => watched_file.path, :inode => watched_file.sincedb_key.inode) + sincedb_value = find(watched_file) + if sincedb_value.nil? + # sincedb has no record of this inode + # and due to the window handling of many files + # this file may not be opened in this session. + # a new value will be added when the file is opened + logger.trace("associate: unmatched", :filename => watched_file.filename) + return true + end + logger.trace? && logger.trace("associate: found sincedb record", :filename => watched_file.filename, + :sincedb_key => watched_file.sincedb_key, :sincedb_value => sincedb_value) + if sincedb_value.watched_file.nil? # not associated + if sincedb_value.path_in_sincedb.nil? + handle_association(sincedb_value, watched_file) + logger.trace? 
&& logger.trace("associate: inode matched but no path in sincedb", :filename => watched_file.filename) + return true + end + if sincedb_value.path_in_sincedb == watched_file.path + # the path on disk is the same as discovered path and the inode is the same. + handle_association(sincedb_value, watched_file) + logger.trace? && logger.trace("associate: inode and path matched", :filename => watched_file.filename) + return true + end + # the path on disk is different from discovered unassociated path but they have the same key (inode) + # treat as a new file, a new value will be added when the file is opened + sincedb_value.clear_watched_file + delete(watched_file.sincedb_key) + logger.trace? && logger.trace("associate: matched but allocated to another", :filename => watched_file.filename) + return true + end + if sincedb_value.watched_file.equal?(watched_file) # pointer equals + logger.trace? && logger.trace("associate: already associated", :filename => watched_file.filename) + return true + end + # sincedb_value.watched_file is not this discovered watched_file but they have the same key (inode) + # this means that the filename path was changed during this session. + # renamed file can be discovered... + # before the original is detected as deleted: state is `active` + # after the original is detected as deleted but before it is actually deleted: state is `delayed_delete` + # after the original is deleted + # are not yet in the delete phase, let this play out + existing_watched_file = sincedb_value.watched_file + logger.trace? && logger.trace("associate: found sincedb_value has a watched_file - this is a rename", + :this_watched_file => watched_file.details, :existing_watched_file => existing_watched_file.details) + watched_file.rotation_in_progress + true + end + + def find(watched_file) + get(watched_file.sincedb_key) + end + + def member?(key) + @sincedb.member?(key) + end + + def get(key) + @sincedb[key] + end + + def set(key, value) + @sincedb[key] = value + value + end + + def delete(key) + @sincedb.delete(key) + end + + def last_read(key) + @sincedb[key].position + end + + def rewind(key) + @sincedb[key].update_position(0) + end + + def increment(key, amount) + @sincedb[key].increment_position(amount) + end + + def set_watched_file(key, watched_file) + @sincedb[key].set_watched_file(watched_file) + end + + def watched_file_deleted(watched_file) + value = @sincedb[watched_file.sincedb_key] + value.unset_watched_file if value + end + + def store_last_read(key, pos) + @sincedb[key].update_position(pos) + end + + def clear_watched_file(key) + @sincedb[key].clear_watched_file + end + + def reading_completed(key) + @sincedb[key].reading_completed + end + + def clear + @sincedb.clear + end + + def keys + @sincedb.keys + end + + def watched_file_unset?(key) + return false unless member?(key) + get(key).watched_file.nil? + end + + def flush_at_interval + now = Time.now + delta = now.to_i - @sincedb_last_write + if delta >= @settings.sincedb_write_interval + logger.debug("writing sincedb (delta since last write = #{delta})") + sincedb_write(now) + end + end + + private + + def handle_association(sincedb_value, watched_file) + watched_file.update_bytes_read(sincedb_value.position) + sincedb_value.set_watched_file(watched_file) + watched_file.initial_completed + if watched_file.all_read? + watched_file.ignore + logger.trace? 
&& logger.trace("handle_association fully read, ignoring", :watched_file => watched_file.details, :sincedb_value => sincedb_value) + end + end + + def set_key_value(key, value) + if @time_sdb_opened < value.last_changed_at_expires(@settings.sincedb_expiry_duration) + set(key, value) + else + logger.debug("set_key_value: record has expired, skipping: #{key.inspect} => #{value.inspect}") + end + end + + def sincedb_write(time = Time.now) + logger.trace? && logger.trace("sincedb_write: #{path} (time = #{time})") + begin + expired_keys = @write_method.call(time) + expired_keys.each do |key| + @sincedb[key].unset_watched_file + delete(key) + logger.trace? && logger.trace("sincedb_write: cleaned", :key => key) + end + @sincedb_last_write = time.to_i + @write_requested = false + rescue Errno::EACCES => e + # no file handles free perhaps - maybe it will work next time + logger.debug("sincedb_write: #{path} error:", :exception => e.class, :message => e.message) + end + end + + # @return expired keys + def atomic_write(time) + logger.trace? && logger.trace("non_atomic_write: ", :time => time) + begin + FileHelper.write_atomically(@full_path) do |io| + @serializer.serialize(@sincedb, io, time.to_f) + end + rescue Errno::EPERM, Errno::EACCES => e + logger.warn("sincedb_write: unable to write atomically due to permissions error, falling back to non-atomic write: #{path} error:", :exception => e.class, :message => e.message) + @write_method = method(:non_atomic_write) + non_atomic_write(time) + rescue => e + logger.warn("sincedb_write: unable to write atomically, attempting non-atomic write: #{path} error:", :exception => e.class, :message => e.message) + non_atomic_write(time) + end + end + + # @return expired keys + def non_atomic_write(time) + logger.trace? && logger.trace("non_atomic_write: ", :time => time) + File.open(@full_path, "w+") do |io| + @serializer.serialize(@sincedb, io, time.to_f) + end + end + end +end diff --git a/lib/filewatch/sincedb_record_serializer.rb b/lib/filewatch/sincedb_record_serializer.rb new file mode 100644 index 00000000..6dcc1688 --- /dev/null +++ b/lib/filewatch/sincedb_record_serializer.rb @@ -0,0 +1,68 @@ +# encoding: utf-8 + +module FileWatch + class SincedbRecordSerializer + + def self.days_to_seconds(days) + (24 * 3600) * days.to_f + end + + def initialize(sincedb_value_expiry) + @sincedb_value_expiry = sincedb_value_expiry + end + + # @return Array expired keys (ones that were not written to the file) + def serialize(db, io, as_of = Time.now.to_f) + expired_keys = [] + db.each do |key, value| + if as_of > value.last_changed_at_expires(@sincedb_value_expiry) + expired_keys << key + next + end + io.write(serialize_record(key, value)) + end + expired_keys + end + + def deserialize(io) + io.each do |record| + yield deserialize_record(record) #.tap{|val| STDERR.puts val} + end + end + + def serialize_record(k, v) + "#{k} #{v}\n" # effectively InodeStruct#to_s SincedbValue#to_s + end + + def deserialize_record(record) + return [] if record.nil? || record.empty? + parts = record.split(" ") + parse_line_v2(parts) || parse_line_v1(parts) + end + + private + + def parse_line_v2(parts) + # new format e.g. 2977152 1 4 94 1519319662.852678 'path/to/file' + # do we want to store the last known state of the watched file too? 
+ return false if parts.size < 5 + inode_struct = prepare_inode_struct(parts) + pos = parts.shift.to_i + expires_at = Float(parts.shift) # this is like Time.now.to_f + path_in_sincedb = parts.join(" ") + value = SincedbValue.new(pos, expires_at).add_path_in_sincedb(path_in_sincedb) + [inode_struct, value] + end + + def parse_line_v1(parts) + # old inode based e.g. 2977152 1 4 94 + inode_struct = prepare_inode_struct(parts) + pos = parts.shift.to_i + [inode_struct, SincedbValue.new(pos)] + end + + def prepare_inode_struct(parts) + InodeStruct.new(parts.shift, *parts.shift(2).map(&:to_i)) + end + end +end diff --git a/lib/filewatch/sincedb_value.rb b/lib/filewatch/sincedb_value.rb new file mode 100644 index 00000000..56ac4840 --- /dev/null +++ b/lib/filewatch/sincedb_value.rb @@ -0,0 +1,92 @@ +# encoding: utf-8 + +module FileWatch + # Tracks the position and expiry of the offset of a file-of-interest + # NOTE: the `watched_file.bytes_read` and this `sincedb_value.position` can diverge + # At any given moment IF the `watched_file.bytes_read` is greater than `sincedb_value.position` + # then it is larger to account for bytes held in the `watched_file.buffer` + # in Tail mode if we quit the buffer is not flushed and we restart from + # the `sincedb_value.position` (end of the last line read). + # in Read mode the buffer is flushed as a line and both values should be the same. + class SincedbValue + attr_reader :last_changed_at, :watched_file, :path_in_sincedb, :position + + def initialize(position, last_changed_at = nil, watched_file = nil) + @position = position # this is the value read from disk + @last_changed_at = last_changed_at + @watched_file = watched_file + touch if @last_changed_at.nil? || @last_changed_at.zero? + end + + def add_path_in_sincedb(path) + @path_in_sincedb = path # can be nil + self + end + + def last_changed_at_expires(duration) + @last_changed_at + duration + end + + def update_position(pos) + # called when we reset the position to bof or eof on shrink or file read complete + touch + @position = pos + @watched_file.update_bytes_read(pos) unless @watched_file.nil? + end + + def increment_position(pos) + # called when actual lines are sent to the observer listener + # this gets serialized as its a more true indication of position than + # chunk read size + touch + @position += pos + end + + def set_watched_file(watched_file) + touch + @watched_file = watched_file + end + + def touch + @last_changed_at = Time.now.to_f + end + + def to_s + # consider serializing the watched_file state as well + "#{position} #{last_changed_at}".tap do |s| + if @watched_file.nil? + s.concat(" ").concat(@path_in_sincedb) unless @path_in_sincedb.nil? + else + s.concat(" ").concat(@watched_file.path) + end + end + end + + def clear_watched_file + @watched_file = nil + end + + def reading_completed + touch + @path_in_sincedb = @watched_file.path + @position = @watched_file.bytes_read + end + + def unset_watched_file + # called in read mode only because we flushed any remaining bytes as a final line. + # cache the position + # we don't cache the path here because we know we are done with this file. + # either due via the `delete` handling + # or when read mode is done with a file. + # in the case of `delete` if the file was renamed then @watched_file is the + # watched_file of the previous path and the new path will be discovered and + # it should have the same inode as before. 
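The sincedb_clean_after setting is expressed in days while SincedbValue expiry works in seconds; the SincedbRecordSerializer.days_to_seconds helper above exists for that conversion, and last_changed_at_expires adds the duration to the entry's timestamp. A quick sketch, assuming the FileWatch classes are loaded:

    expiry = FileWatch::SincedbRecordSerializer.days_to_seconds(14)  # => 1209600.0 seconds
    value  = FileWatch::SincedbValue.new(0)                          # new value, touched to Time.now.to_f
    value.last_changed_at_expires(expiry) > Time.now.to_f            # => true, the entry is still fresh
    # serialize and set_key_value drop any entry whose expiry time is already in the past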
+ # The key from the new watched_file should then locate this entry and we + # can resume from the cached position + return if @watched_file.nil? + wf = @watched_file + @watched_file = nil + @position = wf.bytes_read + end + end +end diff --git a/lib/filewatch/stat/generic.rb b/lib/filewatch/stat/generic.rb new file mode 100644 index 00000000..0ecaaafe --- /dev/null +++ b/lib/filewatch/stat/generic.rb @@ -0,0 +1,29 @@ +# encoding: utf-8 + +module FileWatch module Stat + class Generic + + attr_reader :inode, :modified_at, :size, :inode_struct + + def initialize(source) + @source = source # Pathname + restat + end + + def restat + stat = @source.stat + @inode = stat.ino.to_s + @modified_at = stat.mtime.to_f + @size = stat.size + @inode_struct = InodeStruct.new(@inode, stat.dev_major, stat.dev_minor) + end + + def windows? + false + end + + def inspect + "<#{self.class.name} size=#{@size}, modified_at=#{@modified_at}, inode='#{@inode}', inode_struct=#{@inode_struct}>" + end + end +end end diff --git a/lib/filewatch/stat/windows_path.rb b/lib/filewatch/stat/windows_path.rb new file mode 100644 index 00000000..68b1cda2 --- /dev/null +++ b/lib/filewatch/stat/windows_path.rb @@ -0,0 +1,30 @@ +# encoding: utf-8 + +module FileWatch module Stat + class WindowsPath + + attr_reader :inode, :modified_at, :size, :inode_struct + + def initialize(source) + @source = source # Pathname + @inode = Winhelper.identifier_from_path(@source.to_path) + # in windows the dev hi and low are in the identifier + @inode_struct = InodeStruct.new(@inode, 0, 0) + restat + end + + def restat + stat = @source.stat + @modified_at = stat.mtime.to_f + @size = stat.size + end + + def windows? + true + end + + def inspect + "<#{self.class.name} size=#{@size}, modified_at=#{@modified_at}, inode=#{@inode}, inode_struct=#{@inode_struct}>" + end + end +end end diff --git a/lib/filewatch/tail_mode/handlers/base.rb b/lib/filewatch/tail_mode/handlers/base.rb new file mode 100644 index 00000000..4a6e7c26 --- /dev/null +++ b/lib/filewatch/tail_mode/handlers/base.rb @@ -0,0 +1,166 @@ +# encoding: utf-8 +require "logstash/util/loggable" + +module FileWatch module TailMode module Handlers + class Base + include LogStash::Util::Loggable + attr_reader :sincedb_collection + + def initialize(processor, sincedb_collection, observer, settings) + @settings = settings + @processor = processor + @sincedb_collection = sincedb_collection + @observer = observer + end + + def quit? + @processor.watch.quit? + end + + def handle(watched_file) + logger.trace? && logger.trace("handling:", :path => watched_file.path) + unless watched_file.has_listener? + watched_file.set_listener(@observer) + end + handle_specifically(watched_file) + end + + def handle_specifically(watched_file) + # some handlers don't need to define this method + end + + def update_existing_specifically(watched_file, sincedb_value) + # when a handler subclass does not implement this then do nothing + end + + private + + def controlled_read(watched_file, loop_control) + changed = false + logger.trace? 
&& logger.trace(__method__.to_s, :iterations => loop_control.count, :amount => loop_control.size, :filename => watched_file.filename) + # from a real config (has 102 file inputs) + # -- This cfg creates a file input for every log file to create a dedicated file pointer and read all file simultaneously + # -- If we put all log files in one file input glob we will have indexing delay, because Logstash waits until the first file becomes EOF + # by allowing the user to specify a combo of `file_chunk_count` X `file_chunk_size`... + # we enable the pseudo parallel processing of each file. + # user also has the option to specify a low `stat_interval` and a very high `discover_interval`to respond + # quicker to changing files and not allowing too much content to build up before reading it. + loop_control.count.times do + break if quit? + begin + logger.debug? && logger.debug("#{__method__} get chunk") + result = watched_file.read_extract_lines(loop_control.size) # expect BufferExtractResult + logger.trace(result.warning, result.additional) unless result.warning.empty? + changed = true + result.lines.each do |line| + watched_file.listener.accept(line) + # sincedb position is now independent from the watched_file bytes_read + sincedb_collection.increment(watched_file.sincedb_key, line.bytesize + @settings.delimiter_byte_size) + end + rescue EOFError => e + # it only makes sense to signal EOF in "read" mode not "tail" + logger.debug(__method__.to_s, exception_details(watched_file.path, e, false)) + loop_control.flag_read_error + break + rescue Errno::EWOULDBLOCK, Errno::EINTR => e + logger.debug(__method__.to_s, exception_details(watched_file.path, e, false)) + watched_file.listener.error + loop_control.flag_read_error + break + rescue => e + logger.error("#{__method__} general error reading", exception_details(watched_file.path, e)) + watched_file.listener.error + loop_control.flag_read_error + break + end + end + logger.debug("#{__method__} stopped loop due quit") if quit? + sincedb_collection.request_disk_flush if changed + end + + def open_file(watched_file) + return true if watched_file.file_open? + logger.trace? && logger.trace("open_file", :filename => watched_file.filename) + begin + watched_file.open + rescue => e + # don't emit this message too often. if a file that we can't + # read is changing a lot, we'll try to open it more often, and spam the logs. + now = Time.now.to_i + logger.trace? && logger.trace("open_file OPEN_WARN_INTERVAL is '#{OPEN_WARN_INTERVAL}'") + if watched_file.last_open_warning_at.nil? || now - watched_file.last_open_warning_at > OPEN_WARN_INTERVAL + logger.warn("failed to open file", exception_details(watched_file.path, e)) + watched_file.last_open_warning_at = now + else + logger.debug("open_file suppressed warning `failed to open file`", exception_details(watched_file.path, e, false)) + end + watched_file.watch # set it back to watch so we can try it again + else + watched_file.listener.opened + end + watched_file.file_open? + end + + def add_or_update_sincedb_collection(watched_file) + sincedb_value = @sincedb_collection.find(watched_file) + if sincedb_value.nil? + sincedb_value = add_new_value_sincedb_collection(watched_file) + watched_file.initial_completed + elsif sincedb_value.watched_file == watched_file + update_existing_sincedb_collection_value(watched_file, sincedb_value) + watched_file.initial_completed + else + logger.trace? 
&& logger.trace("add_or_update_sincedb_collection: found sincedb record", + :sincedb_key => watched_file.sincedb_key, :sincedb_value => sincedb_value) + # detected a rotation, Discoverer can't handle this because this watched file is not a new discovery. + # we must handle it here, by transferring state and have the sincedb value track this watched file + # rotate_as_file and rotate_from will switch the sincedb key to the inode that the path is now pointing to + # and pickup the sincedb_value from before. + logger.debug("add_or_update_sincedb_collection: the found sincedb_value has a watched_file - this is a rename, switching inode to this watched file") + existing_watched_file = sincedb_value.watched_file + if existing_watched_file.nil? + sincedb_value.set_watched_file(watched_file) + logger.trace? && logger.trace("add_or_update_sincedb_collection: switching as new file") + watched_file.rotate_as_file + watched_file.update_bytes_read(sincedb_value.position) + else + sincedb_value.set_watched_file(watched_file) + logger.trace? && logger.trace("add_or_update_sincedb_collection: switching from:", :watched_file => watched_file.details) + watched_file.rotate_from(existing_watched_file) + end + end + sincedb_value + end + + def update_existing_sincedb_collection_value(watched_file, sincedb_value) + logger.trace? && logger.trace("update_existing_sincedb_collection_value", :position => sincedb_value.position, + :filename => watched_file.filename, :last_stat_size => watched_file.last_stat_size) + update_existing_specifically(watched_file, sincedb_value) + end + + def add_new_value_sincedb_collection(watched_file) + sincedb_value = get_new_value_specifically(watched_file) + logger.trace? && logger.trace("add_new_value_sincedb_collection", :position => sincedb_value.position, + :watched_file => watched_file.details) + sincedb_collection.set(watched_file.sincedb_key, sincedb_value) + sincedb_value + end + + def get_new_value_specifically(watched_file) + position = watched_file.position_for_new_sincedb_value + value = SincedbValue.new(position) + value.set_watched_file(watched_file) + watched_file.update_bytes_read(position) + value + end + + private + + def exception_details(path, e, trace = true) + details = { :path => path, :exception => e.class, :message => e.message } + details[:backtrace] = e.backtrace if trace && logger.debug? 
+ details + end + + end +end end end diff --git a/lib/filewatch/tail_mode/handlers/create.rb b/lib/filewatch/tail_mode/handlers/create.rb new file mode 100644 index 00000000..167bc2c9 --- /dev/null +++ b/lib/filewatch/tail_mode/handlers/create.rb @@ -0,0 +1,16 @@ +# encoding: utf-8 + +module FileWatch module TailMode module Handlers + class Create < Base + def handle_specifically(watched_file) + if open_file(watched_file) + add_or_update_sincedb_collection(watched_file) unless sincedb_collection.member?(watched_file.sincedb_key) + end + end + + def update_existing_specifically(watched_file, sincedb_value) + # sincedb_value is the source of truth + watched_file.update_bytes_read(sincedb_value.position) + end + end +end end end diff --git a/lib/filewatch/tail_mode/handlers/create_initial.rb b/lib/filewatch/tail_mode/handlers/create_initial.rb new file mode 100644 index 00000000..8e01f9d2 --- /dev/null +++ b/lib/filewatch/tail_mode/handlers/create_initial.rb @@ -0,0 +1,22 @@ +# encoding: utf-8 + +module FileWatch module TailMode module Handlers + class CreateInitial < Base + def handle_specifically(watched_file) + if open_file(watched_file) + logger.trace("handle_specifically opened file handle: #{watched_file.file.fileno}, path: #{watched_file.filename}") + add_or_update_sincedb_collection(watched_file) + end + end + + def update_existing_specifically(watched_file, sincedb_value) + position = watched_file.last_stat_size + if @settings.start_new_files_at == :beginning + position = 0 + end + logger.trace("update_existing_specifically - #{watched_file.path}: seeking to #{position}") + watched_file.update_bytes_read(position) + sincedb_value.update_position(position) + end + end +end end end diff --git a/lib/filewatch/tail_mode/handlers/delete.rb b/lib/filewatch/tail_mode/handlers/delete.rb new file mode 100644 index 00000000..5d203db9 --- /dev/null +++ b/lib/filewatch/tail_mode/handlers/delete.rb @@ -0,0 +1,21 @@ +# encoding: utf-8 + +module FileWatch module TailMode module Handlers + class Delete < Base + DATA_LOSS_WARNING = "watched file path was deleted or rotated before all content was read, if the file is found again it will be read from the last position" + def handle_specifically(watched_file) + # TODO consider trying to find the renamed file - it will have the same inode. + # Needs a rotate scheme rename hint from user e.g. "-YYYY-MM-DD-N." or "..N" + # send the found content to the same listener (stream identity) + logger.trace? && logger.trace(__method__.to_s, :path => watched_file.path, :watched_file => watched_file.details) + if watched_file.bytes_unread > 0 + logger.warn(DATA_LOSS_WARNING, :path => watched_file.path, :unread_bytes => watched_file.bytes_unread) + end + watched_file.listener.deleted + # no need to worry about data in the buffer + # if found it will be associated by inode and read from last position + sincedb_collection.watched_file_deleted(watched_file) + watched_file.file_close + end + end +end end end diff --git a/lib/filewatch/tail_mode/handlers/grow.rb b/lib/filewatch/tail_mode/handlers/grow.rb new file mode 100644 index 00000000..7cc2d268 --- /dev/null +++ b/lib/filewatch/tail_mode/handlers/grow.rb @@ -0,0 +1,15 @@ +# encoding: utf-8 + +module FileWatch module TailMode module Handlers + class Grow < Base + def handle_specifically(watched_file) + watched_file.file_seek(watched_file.bytes_read) + loop do + break if quit? 
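CreateInitial#update_existing_specifically above chooses the initial read position from start_new_files_at: :beginning rewinds to zero, anything else starts at the current stat size, i.e. tails from the end. A sketch with hypothetical numbers:

    last_stat_size     = 1_048_576
    start_new_files_at = :beginning   # the alternative is :end
    position = start_new_files_at == :beginning ? 0 : last_stat_size
    position   # => 0; with :end the first read would begin at byte 1_048_576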
+ loop_control = watched_file.loop_control_adjusted_for_stat_size + controlled_read(watched_file, loop_control) + break unless loop_control.keep_looping? + end + end + end +end end end diff --git a/lib/filewatch/tail_mode/handlers/shrink.rb b/lib/filewatch/tail_mode/handlers/shrink.rb new file mode 100644 index 00000000..be9da9e9 --- /dev/null +++ b/lib/filewatch/tail_mode/handlers/shrink.rb @@ -0,0 +1,23 @@ +# encoding: utf-8 + +module FileWatch module TailMode module Handlers + class Shrink < Base + def handle_specifically(watched_file) + add_or_update_sincedb_collection(watched_file) + watched_file.file_seek(watched_file.bytes_read) + loop do + break if quit? + loop_control = watched_file.loop_control_adjusted_for_stat_size + controlled_read(watched_file, loop_control) + break unless loop_control.keep_looping? + end + end + + def update_existing_specifically(watched_file, sincedb_value) + # we have a match but size is smaller - set all to zero + watched_file.reset_bytes_unread + sincedb_value.update_position(0) + logger.trace? && logger.trace("update_existing_specifically: was truncated seeking to beginning", :watched_file => watched_file.details, :sincedb_value => sincedb_value) + end + end +end end end diff --git a/lib/filewatch/tail_mode/handlers/timeout.rb b/lib/filewatch/tail_mode/handlers/timeout.rb new file mode 100644 index 00000000..248eee06 --- /dev/null +++ b/lib/filewatch/tail_mode/handlers/timeout.rb @@ -0,0 +1,10 @@ +# encoding: utf-8 + +module FileWatch module TailMode module Handlers + class Timeout < Base + def handle_specifically(watched_file) + watched_file.listener.timed_out + watched_file.file_close + end + end +end end end diff --git a/lib/filewatch/tail_mode/handlers/unignore.rb b/lib/filewatch/tail_mode/handlers/unignore.rb new file mode 100644 index 00000000..b59118be --- /dev/null +++ b/lib/filewatch/tail_mode/handlers/unignore.rb @@ -0,0 +1,39 @@ +# encoding: utf-8 + +module FileWatch module TailMode module Handlers + class Unignore < Base + # a watched file can be put straight into the ignored state + # before any other handling has been done + # at a minimum we create or associate a sincedb value + def handle_specifically(watched_file) + add_or_update_sincedb_collection(watched_file) + end + + def get_new_value_specifically(watched_file) + # for file initially ignored their bytes_read was set to stat.size + # use this value not the `start_new_files_at` for the position + # logger.trace("get_new_value_specifically", "watched_file" => watched_file.inspect) + SincedbValue.new(watched_file.bytes_read).tap do |sincedb_value| + sincedb_value.set_watched_file(watched_file) + logger.trace? && logger.trace("get_new_value_specifically: unignore", :watched_file => watched_file.details, :sincedb_value => sincedb_value) + end + end + + def update_existing_specifically(watched_file, sincedb_value) + # when this watched_file was ignored it had it bytes_read set to eof + # now the file has changed (watched_file.size_changed?) + # it has been put into the watched state so when it becomes active + # we will handle grow or shrink + # for now we seek to where we were before the file got ignored (grow) + # or to the start (shrink) + logger.trace? && logger.trace("update_existing_specifically: unignore", :watched_file => watched_file.details, :sincedb_value => sincedb_value) + position = 0 + if watched_file.shrunk? 
+ watched_file.update_bytes_read(0) + else + position = watched_file.bytes_read + end + sincedb_value.update_position(position) + end + end +end end end diff --git a/lib/filewatch/tail_mode/processor.rb b/lib/filewatch/tail_mode/processor.rb new file mode 100644 index 00000000..6634dc91 --- /dev/null +++ b/lib/filewatch/tail_mode/processor.rb @@ -0,0 +1,293 @@ +# encoding: utf-8 +require 'filewatch/processor' +require_relative "handlers/base" +require_relative "handlers/create_initial" +require_relative "handlers/create" +require_relative "handlers/delete" +require_relative "handlers/grow" +require_relative "handlers/shrink" +require_relative "handlers/timeout" +require_relative "handlers/unignore" + +module FileWatch module TailMode + # Must handle + # :create_initial - file is discovered and we have no record of it in the sincedb + # :create - file is discovered and we have seen it before in the sincedb + # :grow - file has more content + # :shrink - file has less content + # :delete - file can't be read + # :timeout - file is closable + # :unignore - file was ignored, but have now received new content + class Processor < FileWatch::Processor + + def initialize_handlers(sincedb_collection, observer) + @sincedb_collection = sincedb_collection + @create_initial = Handlers::CreateInitial.new(self, sincedb_collection, observer, @settings) + @create = Handlers::Create.new(self, sincedb_collection, observer, @settings) + @grow = Handlers::Grow.new(self, sincedb_collection, observer, @settings) + @shrink = Handlers::Shrink.new(self, sincedb_collection, observer, @settings) + @delete = Handlers::Delete.new(self, sincedb_collection, observer, @settings) + @timeout = Handlers::Timeout.new(self, sincedb_collection, observer, @settings) + @unignore = Handlers::Unignore.new(self, sincedb_collection, observer, @settings) + end + + def create(watched_file) + @create.handle(watched_file) + end + + def create_initial(watched_file) + @create_initial.handle(watched_file) + end + + def grow(watched_file) + @grow.handle(watched_file) + end + + def shrink(watched_file) + @shrink.handle(watched_file) + end + + def delete(watched_file) + @delete.handle(watched_file) + end + + def timeout(watched_file) + @timeout.handle(watched_file) + end + + def unignore(watched_file) + @unignore.handle(watched_file) + end + + def process_all_states(watched_files) + process_closed(watched_files) + return if watch.quit? + process_ignored(watched_files) + return if watch.quit? + process_delayed_delete(watched_files) + return if watch.quit? + process_restat_for_watched_and_active(watched_files) + return if watch.quit? + process_rotation_in_progress(watched_files) + return if watch.quit? + process_watched(watched_files) + return if watch.quit? + process_active(watched_files) + end + + private + + def process_closed(watched_files) + logger.trace(__method__.to_s) + # Handles watched_files in the closed state. + # if its size changed it is put into the watched state + watched_files.each do |watched_file| + next unless watched_file.closed? + common_restat_with_delay(watched_file, __method__) do + # it won't do this if rotation is detected + if watched_file.size_changed? + # if the closed file changed, move it to the watched state + # not to active state because we want to respect the active files window. + watched_file.watch + end + end + break if watch.quit? + end + end + + def process_ignored(watched_files) + logger.trace(__method__.to_s) + # Handles watched_files in the ignored state. 
+ # if its size changed: + # put it in the watched state + # invoke unignore + watched_files.each do |watched_file| + next unless watched_file.ignored? + common_restat_with_delay(watched_file, __method__) do + # it won't do this if rotation is detected + if watched_file.size_changed? + watched_file.watch + unignore(watched_file) + end + end + break if watch.quit? + end + end + + def process_delayed_delete(watched_files) + # defer the delete to one loop later to ensure that the stat really really can't find a renamed file + # because a `stat` can be called right in the middle of the rotation rename cascade + logger.trace(__method__.to_s) + watched_files.each do |watched_file| + next unless watched_file.delayed_delete? + logger.trace(">>> Delayed Delete", :path => watched_file.path) + common_restat_without_delay(watched_file, __method__) do + logger.trace(">>> Delayed Delete: file at path found again", :watched_file => watched_file.details) + watched_file.file_at_path_found_again + end + end + end + + def process_restat_for_watched_and_active(watched_files) + # do restat on all watched and active states once now. closed and ignored have been handled already + logger.trace(__method__.to_s) + watched_files.each do |watched_file| + next if !watched_file.watched? && !watched_file.active? + common_restat_with_delay(watched_file, __method__) + end + end + + def process_rotation_in_progress(watched_files) + logger.trace(__method__.to_s) + watched_files.each do |watched_file| + next unless watched_file.rotation_in_progress? + if !watched_file.all_read? + if watched_file.file_open? + # rotated file but original opened file is not fully read + # we need to keep reading the open file, if we close it we lose it because the path is now pointing at a different file. + logger.trace(">>> Rotation In Progress - inode change detected and original content is not fully read, reading all", :watched_file => watched_file.details) + # need to fully read open file while we can + watched_file.set_maximum_read_loop + grow(watched_file) + watched_file.set_standard_read_loop + else + logger.warn(">>> Rotation In Progress - inode change detected and original content is not fully read, file is closed and path points to new content", :watched_file => watched_file.details) + end + end + current_key = watched_file.sincedb_key + sdb_value = @sincedb_collection.get(current_key) + potential_key = watched_file.stat_sincedb_key + potential_sdb_value = @sincedb_collection.get(potential_key) + logger.trace(">>> Rotation In Progress", :watched_file => watched_file.details, :found_sdb_value => sdb_value, :potential_key => potential_key, :potential_sdb_value => potential_sdb_value) + if potential_sdb_value.nil? + logger.trace("---------- >>>> Rotation In Progress: rotating as existing file") + watched_file.rotate_as_file + trace_message = "---------- >>>> Rotation In Progress: no potential sincedb value " + if sdb_value.nil? + trace_message.concat("AND no found sincedb value") + else + trace_message.concat("BUT found sincedb value") + sdb_value.clear_watched_file + end + logger.trace(trace_message) + new_sdb_value = SincedbValue.new(0) + new_sdb_value.set_watched_file(watched_file) + @sincedb_collection.set(potential_key, new_sdb_value) + else + other_watched_file = potential_sdb_value.watched_file + if other_watched_file.nil? 
+ logger.trace("---------- >>>> Rotation In Progress: rotating as existing file WITH potential sincedb value that does not have a watched file reference !!!!!!!!!!!!!!!!!") + watched_file.rotate_as_file(potential_sdb_value.position) + sdb_value.clear_watched_file unless sdb_value.nil? + potential_sdb_value.set_watched_file(watched_file) + else + logger.trace("---------- >>>> Rotation In Progress: rotating from...", :this_watched_file => watched_file.details, :other_watched_file => other_watched_file.details) + watched_file.rotate_from(other_watched_file) + sdb_value.clear_watched_file unless sdb_value.nil? + potential_sdb_value.set_watched_file(watched_file) + end + end + logger.trace("---------- >>>> Rotation In Progress: after handling rotation", :this_watched_file => watched_file.details, :sincedb_value => (potential_sdb_value || sdb_value)) + end + end + + def process_watched(watched_files) + # Handles watched_files in the watched state. + # for a slice of them: + # move to the active state + # and we allow the block to open the file and create a sincedb collection record if needed + # some have never been active and some have + # those that were active before but are watched now were closed under constraint + logger.trace(__method__.to_s) + # how much of the max active window is available + to_take = @settings.max_active - watched_files.count(&:active?) + if to_take > 0 + watched_files.select(&:watched?).take(to_take).each do |watched_file| + watched_file.activate + if watched_file.initial? + create_initial(watched_file) + else + create(watched_file) + end + break if watch.quit? + end + else + now = Time.now.to_i + if (now - watch.lastwarn_max_files) > MAX_FILES_WARN_INTERVAL + waiting = watched_files.size - @settings.max_active + logger.warn("#{@settings.max_warn_msg}, files yet to open: #{waiting}") + watch.lastwarn_max_files = now + end + end + end + + def process_active(watched_files) + logger.trace(__method__.to_s) + # Handles watched_files in the active state. + # files have been opened at this point + watched_files.each do |watched_file| + next unless watched_file.active? + break if watch.quit? + path = watched_file.filename + if watched_file.grown? + logger.trace("#{__method__} file grew: new size is #{watched_file.last_stat_size}, bytes read #{watched_file.bytes_read}", :path => path) + grow(watched_file) + elsif watched_file.shrunk? + if watched_file.bytes_unread > 0 + logger.warn("potential data loss, file truncate detected with #{watched_file.bytes_unread} unread bytes", :path => path) + end + # we don't update the size here, its updated when we actually read + logger.trace("#{__method__} file shrunk: new size is #{watched_file.last_stat_size}, old size #{watched_file.bytes_read}", :path => path) + shrink(watched_file) + else + # same size, do nothing + logger.trace("#{__method__} no change", :path => path) + end + # can any active files be closed to make way for waiting files? + if watched_file.file_closable? + logger.trace("#{__method__} file expired", :path => path) + timeout(watched_file) + watched_file.close + end + end + end + + def common_restat_with_delay(watched_file, action, &block) + common_restat(watched_file, action, true, &block) + end + + def common_restat_without_delay(watched_file, action, &block) + common_restat(watched_file, action, false, &block) + end + + def common_restat(watched_file, action, delay, &block) + all_ok = true + begin + restat(watched_file) + if watched_file.rotation_in_progress? 
+ logger.trace("-------------------- >>>>> restat - rotation_detected", :watched_file => watched_file.details, :new_sincedb_key => watched_file.stat_sincedb_key) + # don't yield to closed and ignore processing + else + yield if block_given? + end + rescue Errno::ENOENT + if delay + logger.trace("#{action} - delaying the stat fail on", :filename => watched_file.filename) + watched_file.delay_delete + else + # file has gone away or we can't read it anymore. + logger.trace("#{action} - after a delay, really can't find this file", :path => watched_file.path) + watched_file.unwatch + logger.trace("#{action} - removing from collection", :filename => watched_file.filename) + delete(watched_file) + add_deletable_path watched_file.path + all_ok = false + end + rescue => e + logger.error("#{action} - other error", error_details(e, watched_file)) + all_ok = false + end + all_ok + end + end +end end diff --git a/lib/filewatch/watch.rb b/lib/filewatch/watch.rb new file mode 100644 index 00000000..8bee1208 --- /dev/null +++ b/lib/filewatch/watch.rb @@ -0,0 +1,90 @@ +# encoding: utf-8 +require "logstash/util/loggable" +require "concurrent/atomic/atomic_boolean" + +module FileWatch + class Watch + include LogStash::Util::Loggable + + attr_accessor :lastwarn_max_files + attr_reader :discoverer, :processor, :watched_files_collection + + def initialize(discoverer, processor, settings) + @discoverer = discoverer + @watched_files_collection = discoverer.watched_files_collection + @settings = settings + + # we need to be threadsafe about the quit mutation + @quit = Concurrent::AtomicBoolean.new(false) + @lastwarn_max_files = 0 + + @processor = processor + @processor.add_watch(self) + end + + def watch(path) + @discoverer.add_path(path) + # don't return whatever @discoverer.add_path returns + return true + end + + def discover + @discoverer.discover + # don't return whatever @discoverer.discover returns + return true + end + + def subscribe(observer, sincedb_collection) + @processor.initialize_handlers(sincedb_collection, observer) + + glob = 0 + interval = @settings.discover_interval + reset_quit + until quit? + iterate_on_state + # Don't discover new files when files to read are known at the beginning + break if quit? + sincedb_collection.write_if_requested + glob += 1 + if glob == interval && !@settings.exit_after_read + discover + glob = 0 + end + break if quit? + # NOTE: maybe the plugin should validate stat_interval <= sincedb_write_interval <= sincedb_clean_after + sleep(@settings.stat_interval) + # we need to check potential expired keys (sincedb_clean_after) periodically + sincedb_collection.flush_at_interval + end + sincedb_collection.write_if_requested # does nothing if no requests to write were lodged. + @watched_files_collection.close_all + end # def subscribe + + # Read mode processor will handle watched_files in the closed, ignored, watched and active state + # differently from Tail mode - see the ReadMode::Processor and TailMode::Processor + def iterate_on_state + return if @watched_files_collection.empty? + begin + # creates this snapshot of watched_file values just once + watched_files = @watched_files_collection.values + @processor.process_all_states(watched_files) + ensure + @watched_files_collection.remove_paths(@processor.clear_deletable_paths) + end + end + + def quit + @quit.make_true + end + + def quit? + @quit.true? || (@settings.exit_after_read && @watched_files_collection.empty?) 
+ end + + private + + def reset_quit + @quit.make_false + end + end +end diff --git a/lib/filewatch/watched_file.rb b/lib/filewatch/watched_file.rb new file mode 100644 index 00000000..5d6876f3 --- /dev/null +++ b/lib/filewatch/watched_file.rb @@ -0,0 +1,440 @@ +# encoding: utf-8 + +module FileWatch + class WatchedFile + PATH_BASED_STAT = 0 + IO_BASED_STAT = 1 + + attr_reader :bytes_read, :state, :file, :buffer, :recent_states, :bytes_unread + attr_reader :path, :accessed_at, :pathname, :filename + attr_reader :listener, :read_loop_count, :read_chunk_size, :stat + attr_reader :loop_count_type, :loop_count_mode + attr_accessor :last_open_warning_at + + # this class represents a file that has been discovered + # path based stat is taken at discovery + def initialize(pathname, stat, settings) + @settings = settings + @pathname = Pathname.new(pathname) # given arg pathname might be a string or a Pathname object + @path = @pathname.to_path.freeze + @filename = @pathname.basename.to_s + full_state_reset(stat) + watch + set_standard_read_loop + set_accessed_at + end + + def full_state_reset(this_stat = nil) + if this_stat.nil? + begin + this_stat = PathStatClass.new(pathname) + rescue Errno::ENOENT + delay_delete + return + end + end + @bytes_read = 0 # tracks bytes read from the open file or initialized from a matched sincedb_value off disk. + @bytes_unread = 0 # tracks bytes not yet read from the open file. So we can warn on shrink when unread bytes are seen. + file_close + set_stat(this_stat) + @listener = nil + @last_open_warning_at = nil + # initial as true means we have not associated this watched_file with a previous sincedb value yet. + # and we should read from the beginning if necessary + @initial = true + @recent_states = [] # keep last 8 states, managed in set_state + # the prepare_inode method is sourced from the mixed module above + watch if active? || @state.nil? + end + + def rotate_from(other) + # move all state from other to this one + set_standard_read_loop + file_close + @bytes_read = other.bytes_read + @bytes_unread = other.bytes_unread + @listener = nil + @initial = false + @recent_states = other.recent_states + @accessed_at = other.accessed_at + if !other.delayed_delete? + # we don't know if a file exists at the other.path yet + # so no reset + other.full_state_reset + end + set_stat PathStatClass.new(pathname) + ignore + end + + def set_stat(stat) + @stat = stat + @size = @stat.size + @sdb_key_v1 = @stat.inode_struct + end + private :set_stat + + def rotate_as_file(bytes_read = 0) + # rotation, when a sincedb record exists for new inode, but no watched file to rotate from + # probably caused by a deletion detected in the middle of the rename cascade + # RARE due to delayed_delete - there would have to be a large time span between the renames. + @bytes_read = bytes_read # tracks bytes read from the open file or initialized from a matched sincedb_value off disk. + @bytes_unread = 0 # tracks bytes not yet read from the open file. So we can warn on shrink when unread bytes are seen. + @last_open_warning_at = nil + # initial as true means we have not associated this watched_file with a previous sincedb value yet. + # and we should read from the beginning if necessary + @initial = false + @recent_states = [] # keep last 8 states, managed in set_state + set_stat(PathStatClass.new(pathname)) + reopen + watch + end + + def stat_sincedb_key + @stat.inode_struct + end + + def rotation_detected? 
+ stat_sincedb_key != sincedb_key + end + + # @return true if the file was modified since last stat + def restat! + modified_at # to always be able to detect changes + @stat.restat + if rotation_detected? + # switch to new state now + rotation_in_progress + return true + else + @size = @stat.size + update_bytes_unread + modified_at_changed? + end + end + + def modified_at(update = false) + if update || @modified_at.nil? + @modified_at = @stat.modified_at + else + @modified_at + end + end + + # @return whether modified_at changed since it was last read + # @see #restat! + def modified_at_changed? + modified_at != @stat.modified_at + end + + def position_for_new_sincedb_value + if @initial + # this file was found in first discovery + @settings.start_new_files_at == :beginning ? 0 : last_stat_size + else + # always start at the beginning if found after first discovery + 0 + end + end + + def last_stat_size + @stat.size + end + + def current_size + @size + end + + def shrunk? + @size < @bytes_read + end + + def grown? + @size > @bytes_read + end + + def size_changed? + # called from closed and ignored + # before the last stat was taken file should be fully read. + @size != @bytes_read + end + + def all_read? + @bytes_read >= @size + end + + def file_at_path_found_again + restore_previous_state + end + + def set_listener(observer) + @listener = observer.listener_for(@path) + end + + def unset_listener + @listener = nil + end + + def has_listener? + !@listener.nil? + end + + def sincedb_key + @sdb_key_v1 + end + + def initial_completed + @initial = false + end + + def set_accessed_at + @accessed_at = Time.now.to_f + end + + def initial? + @initial + end + + def compressed? + @path.end_with?('.gz','.gzip') + end + + def reopen + if file_open? + file_close + open + end + end + + def open + file_add_opened(FileOpener.open(@path)) + end + + def file_add_opened(rubyfile) + @file = rubyfile + @buffer = BufferedTokenizer.new(@settings.delimiter) if @buffer.nil? + end + + def file_close + return if @file.nil? || @file.closed? + @file.close + @file = nil + end + + def file_seek(amount, whence = IO::SEEK_SET) + @file.sysseek(amount, whence) + end + + def file_read(amount = nil) + set_accessed_at + @file.sysread(amount || @read_chunk_size) + end + + def file_open? + !@file.nil? && !@file.closed? + end + + def reset_buffer + @buffer.flush + end + + def read_extract_lines(amount) + data = file_read(amount) + result = buffer_extract(data) + increment_bytes_read(data.bytesize) + result + end + + def buffer_extract(data) + warning, additional = "", {} + lines = @buffer.extract(data) + if lines.empty? + warning.concat("buffer_extract: a delimiter can't be found in current chunk") + warning.concat(", maybe there are no more delimiters or the delimiter is incorrect") + warning.concat(" or the text before the delimiter, a 'line', is very large") + warning.concat(", if this message is logged often try increasing the `file_chunk_size` setting.") + additional["delimiter"] = @settings.delimiter + additional["read_position"] = @bytes_read + additional["bytes_read_count"] = data.bytesize + additional["last_known_file_size"] = last_stat_size + additional["file_path"] = @path + end + BufferExtractResult.new(lines, warning, additional) + end + + def increment_bytes_read(delta) + return if delta.nil? + @bytes_read += delta + update_bytes_unread + @bytes_read + end + + def update_bytes_read(total_bytes_read) + return if total_bytes_read.nil? 
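
# Standalone illustration of the identity comparison behind rotation_detected?
# above: the inode captured when the file was last opened/reset is compared with
# a fresh stat of the same path; after a logrotate-style rename the path points
# at a different inode, so the keys no longer match. POSIX-only sketch, not the
# plugin's own PathStatClass/InodeStruct.
require "tmpdir"

Dir.mktmpdir do |dir|
  path = File.join(dir, "app.log")
  File.write(path, "old content\n")
  remembered_key = [File.stat(path).dev, File.stat(path).ino]   # ~ sincedb_key

  File.rename(path, path + ".1")       # rotation renames the original away...
  File.write(path, "new content\n")    # ...and a fresh file appears at the old path

  current_key = [File.stat(path).dev, File.stat(path).ino]      # ~ stat_sincedb_key
  puts current_key != remembered_key                             # => true, rotation detected
end
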
+ @bytes_read = total_bytes_read + update_bytes_unread + @bytes_read + end + + def rotation_in_progress + set_state :rotation_in_progress + end + + def activate + set_state :active + end + + def ignore + set_state :ignored + end + + def ignore_as_unread + ignore + @bytes_read = @size + end + + def close + set_state :closed + end + + def watch + set_state :watched + end + + def unwatch + set_state :unwatched + end + + def delay_delete + set_state :delayed_delete + end + + def restore_previous_state + set_state @recent_states.pop + end + + def rotation_in_progress? + @state == :rotation_in_progress + end + + def active? + @state == :active + end + + def delayed_delete? + @state == :delayed_delete + end + + def ignored? + @state == :ignored + end + + def closed? + @state == :closed + end + + def watched? + @state == :watched + end + + def unwatched? + @state == :unwatched + end + + def expiry_close_enabled? + !@settings.close_older.nil? + end + + def expiry_ignore_enabled? + !@settings.ignore_older.nil? + end + + def set_standard_read_loop + @read_loop_count = @settings.file_chunk_count + @read_chunk_size = @settings.file_chunk_size + # e.g. 1 * 10 bytes -> 10 or 256 * 65536 -> 1677716 or 140737488355327 * 32768 -> 4611686018427355136 + @standard_loop_max_bytes = @read_loop_count * @read_chunk_size + end + + def set_maximum_read_loop + # used to quickly fully read an open file when rotation is detected + @read_loop_count = FileWatch::MAX_ITERATIONS + @read_chunk_size = FileWatch::FILE_READ_SIZE + @standard_loop_max_bytes = @read_loop_count * @read_chunk_size + end + + def loop_control_adjusted_for_stat_size + more = false + to_read = current_size - @bytes_read + return LoopControlResult.new(0, 0, more) if to_read < 1 + return LoopControlResult.new(1, to_read, more) if to_read < @read_chunk_size + # set as if to_read is greater than or equal to max_bytes + # use the ones from settings and don't indicate more + count = @read_loop_count + if to_read < @standard_loop_max_bytes + # if the defaults are used then this branch will be taken + # e.g. to_read is 100 and max_bytes is 4 * 30 -> 120 + # will overrun and trigger EOF, build less iterations + # will generate 3 * 30 -> 90 this time and we indicate more + # a 2GB file in read mode will get one loop of 64666 x 32768 (2119006656 / 32768) + # and a second loop with 1 x 31168 + count = to_read / @read_chunk_size + more = true + end + LoopControlResult.new(count, @read_chunk_size, more) + end + + def reset_bytes_unread + # called from shrink + @bytes_unread = 0 + end + + def set_state(value) + @recent_states.shift if @recent_states.size == 8 + @recent_states << @state unless @state.nil? + @state = value + end + + def recent_state_history + @recent_states + Array(@state) + end + + def file_closable? + file_can_close? && all_read? + end + + def file_ignorable? + return false unless expiry_ignore_enabled? + # (Time.now - stat.mtime) <- in jruby, this does int and float + # conversions before the subtraction and returns a float. + # so use all floats upfront + (Time.now.to_f - modified_at) > @settings.ignore_older + end + + def file_can_close? + return false unless expiry_close_enabled? 
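
# Standalone sketch of the arithmetic in loop_control_adjusted_for_stat_size
# above (illustrative Struct, not the real LoopControlResult): given the bytes
# left to read and the configured chunk size/count, decide how many chunks to
# read this pass and whether another pass is needed.
LoopControl = Struct.new(:count, :size, :more)

def loop_control(to_read, chunk_size, chunk_count)
  max_bytes = chunk_size * chunk_count
  return LoopControl.new(0, 0, false)       if to_read < 1
  return LoopControl.new(1, to_read, false) if to_read < chunk_size
  if to_read < max_bytes
    LoopControl.new(to_read / chunk_size, chunk_size, true)  # partial pass, come back for the rest
  else
    LoopControl.new(chunk_count, chunk_size, false)          # use the configured maximum per pass
  end
end

loop_control(100, 30, 4) # => #<struct count=3, size=30, more=true>  (90 bytes now, 10 on the next pass)
loop_control(10,  30, 4) # => #<struct count=1, size=10, more=false> (less than one chunk left)
loop_control(500, 30, 4) # => #<struct count=4, size=30, more=false> (capped at the configured loop)
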
+ (Time.now.to_f - @accessed_at) > @settings.close_older + end + + def details + detail = "@filename='#{@filename}', @state=#{@state.inspect}, @recent_states=#{@recent_states.inspect}, " + detail.concat("@bytes_read=#{@bytes_read}, @bytes_unread=#{@bytes_unread}, current_size=#{current_size}, ") + detail.concat("last_stat_size=#{last_stat_size}, file_open?=#{file_open?}, @initial=#{@initial}") + "" + end + + def inspect + "" + end + + def to_s + inspect + end + + private + + def update_bytes_unread + unread = current_size - @bytes_read + @bytes_unread = unread < 0 ? 0 : unread + end + end +end diff --git a/lib/filewatch/watched_files_collection.rb b/lib/filewatch/watched_files_collection.rb new file mode 100644 index 00000000..cbdd46e6 --- /dev/null +++ b/lib/filewatch/watched_files_collection.rb @@ -0,0 +1,22 @@ +# encoding: utf-8 + +require 'java' + +module FileWatch + # @see `org.logstash.filewatch.WatchedFilesCollection` + class WatchedFilesCollection + + # Closes all managed watched files. + # @see FileWatch::WatchedFile#file_close + def close_all + each_file(&:file_close) # synchronized + end + + # @return [Enumerable] managed path keys (snapshot) + alias keys paths + + # @return [Enumerable] managed files (snapshot) + alias values files + + end +end diff --git a/lib/filewatch/winhelper.rb b/lib/filewatch/winhelper.rb new file mode 100644 index 00000000..31c5279e --- /dev/null +++ b/lib/filewatch/winhelper.rb @@ -0,0 +1,207 @@ +# encoding: utf-8 +require "ffi" + +module Winhelper + extend FFI::Library + + ffi_lib 'kernel32' + ffi_convention :stdcall + + class FileTime < FFI::Struct + layout :lowDateTime, :uint, :highDateTime, :uint + end + + #http://msdn.microsoft.com/en-us/library/windows/desktop/aa363788(v=vs.85).aspx + class FileInformation < FFI::Struct + layout :fileAttributes, :uint, #DWORD dwFileAttributes; + :createTime, FileTime, # FILETIME ftCreationTime; + :lastAccessTime, FileTime, # FILETIME ftLastAccessTime; + :lastWriteTime, FileTime, # FILETIME ftLastWriteTime; + :volumeSerialNumber, :uint, # DWORD dwVolumeSerialNumber; + :fileSizeHigh, :uint, # DWORD nFileSizeHigh; + :fileSizeLow, :uint, # DWORD nFileSizeLow; + :numberOfLinks, :uint, # DWORD nNumberOfLinks; + :fileIndexHigh, :uint, # DWORD nFileIndexHigh; + :fileIndexLow, :uint # DWORD nFileIndexLow; + end + + # https://msdn.microsoft.com/en-us/library/windows/desktop/hh965605(v=vs.85).aspx + class FileId128 < FFI::Struct + layout :lowPart, :ulong_long, :highPart, :ulong_long + end + + # https://msdn.microsoft.com/en-us/library/windows/desktop/hh802691(v=vs.85).aspx + class FileIdInfo < FFI::Struct + layout :volumeSerialNumber, :ulong_long, :fileId, FileId128 + # ULONGLONG VolumeSerialNumber; + # FILE_ID_128 FileId; + end + + FileInfoEnum = enum( + :FileBasicInfo, + :FileStandardInfo, + :FileNameInfo, + :FileRenameInfo, + :FileDispositionInfo, + :FileAllocationInfo, + :FileEndOfFileInfo, + :FileStreamInfo, + :FileCompressionInfo, + :FileAttributeTagInfo, + :FileIdBothDirectoryInfo, + :FileIdBothDirectoryRestartInfo, + :FileIoPriorityHintInfo, + :FileRemoteProtocolInfo, + :FileFullDirectoryInfo, + :FileFullDirectoryRestartInfo, + :FileStorageInfo, + :FileAlignmentInfo, + :FileIdInfo, + :FileIdExtdDirectoryInfo, + :FileIdExtdDirectoryRestartInfo + ) + + #http://msdn.microsoft.com/en-us/library/windows/desktop/aa363858(v=vs.85).aspx + #HANDLE WINAPI CreateFile( + # _In_ LPCTSTR lpFileName, + # _In_ DWORD dwDesiredAccess, + # _In_ DWORD dwShareMode, + # _In_opt_ LPSECURITY_ATTRIBUTES lpSecurityAttributes, + # _In_ DWORD 
dwCreationDisposition, + # _In_ DWORD dwFlagsAndAttributes, _In_opt_ HANDLE hTemplateFile); + attach_function :CreateFileA, [:pointer, :uint, :uint, :pointer, :uint, :uint, :pointer], :pointer + attach_function :CreateFileW, [:pointer, :uint, :uint, :pointer, :uint, :uint, :pointer], :pointer + + #http://msdn.microsoft.com/en-us/library/windows/desktop/aa364952(v=vs.85).aspx + #BOOL WINAPI GetFileInformationByHandle( + # _In_ HANDLE hFile, + # _Out_ LPBY_HANDLE_FILE_INFORMATION lpFileInformation); + attach_function :GetFileInformationByHandle, [:pointer, :pointer], :int + + #https://msdn.microsoft.com/en-us/library/windows/desktop/aa364953(v=vs.85).aspx + #BOOL WINAPI GetFileInformationByHandleEx( + # _In_ HANDLE hFile, + # _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, + # _Out_ LPVOID lpFileInformation, + # _In_ DWORD dwBufferSize ); + attach_function :GetFileInformationByHandleEx, [:pointer, FileInfoEnum, :pointer, :uint], :uint + + attach_function :CloseHandle, [:pointer], :int + + #https://msdn.microsoft.com/en-us/library/windows/desktop/aa964920(v=vs.85).aspx + #BOOL WINAPI GetVolumeInformationByHandleW( + # _In_ HANDLE hFile, + # _Out_opt_ LPWSTR lpVolumeNameBuffer, + # _In_ DWORD nVolumeNameSize, + # _Out_opt_ LPDWORD lpVolumeSerialNumber, + # _Out_opt_ LPDWORD lpMaximumComponentLength, + # _Out_opt_ LPDWORD lpFileSystemFlags, + # _Out_opt_ LPWSTR lpFileSystemNameBuffer, + # _In_ DWORD nFileSystemNameSize); + attach_function :GetVolumeInformationByHandleW, [:pointer, :pointer, :uint, :pointer, :pointer, :pointer, :pointer, :uint], :int + + def self.file_system_type_from_path(path) + file_system_type_from_handle(open_handle_from_path(path)) + end + + def self.file_system_type_from_io(io) + FileWatch::FileExt.io_handle(io) do |pointer| + file_system_type_from_handle(pointer, false) + end + end + + def self.file_system_type_from_handle(handle, close_handle = true) + out = FFI::MemoryPointer.new(:char, 256, true) + if GetVolumeInformationByHandleW(handle, nil, 0, nil, nil, nil, out, 256) > 0 + char_pointer_to_ruby_string(out) + else + "unknown" + end + ensure + CloseHandle(handle) if close_handle + end + + def self.identifier_from_io(io) + FileWatch::FileExt.io_handle(io) do |pointer| + identifier_from_handle(pointer, false) + end + end + + def self.identifier_from_path(path) + identifier_from_handle(open_handle_from_path(path)) + end + + def self.identifier_from_path_ex(path) + identifier_from_handle_ex(open_handle_from_path(path)) + end + + def self.identifier_from_io_ex(io) + FileWatch::FileExt.io_handle(io) do |pointer| + identifier_from_handle_ex(pointer, false) + end + end + + def self.identifier_from_handle_ex(handle, close_handle = true) + fileIdInfo = Winhelper::FileIdInfo.new + success = GetFileInformationByHandleEx(handle, :FileIdInfo, fileIdInfo, fileIdInfo.size) + if success > 0 + vsn = fileIdInfo[:volumeSerialNumber] + lpfid = fileIdInfo[:fileId][:lowPart] + hpfid = fileIdInfo[:fileId][:highPart] + return "#{vsn}-#{lpfid}-#{hpfid}" + else + return 'unknown' + end + ensure + CloseHandle(handle) if close_handle + end + + def self.identifier_from_handle(handle, close_handle = true) + fileInfo = Winhelper::FileInformation.new + success = GetFileInformationByHandle(handle, fileInfo) + if success > 0 + #args = [ + # fileInfo[:fileAttributes], fileInfo[:volumeSerialNumber], fileInfo[:fileSizeHigh], fileInfo[:fileSizeLow], + # fileInfo[:numberOfLinks], fileInfo[:fileIndexHigh], fileInfo[:fileIndexLow] + # ] + #p "Information: %u %u %u %u %u %u %u " % args + #this is only 
guaranteed on NTFS, for ReFS on windows 2012, GetFileInformationByHandleEx should be used with FILE_ID_INFO, which returns a 128 bit identifier + return "#{fileInfo[:volumeSerialNumber]}-#{fileInfo[:fileIndexLow]}-#{fileInfo[:fileIndexHigh]}" + else + return 'unknown' + end + ensure + CloseHandle(handle) if close_handle + end + + private + + def self.open_handle_from_path(path) + CreateFileW(utf16le(path), 0, 7, nil, 3, 128, nil) + end + + def self.char_pointer_to_ruby_string(char_pointer, length = 256) + bytes = char_pointer.get_array_of_uchar(0, length) + ignore = bytes.reverse.index{|b| b != 0} - 1 + our_bytes = bytes[0, bytes.length - ignore] + our_bytes.pack("C*").force_encoding("UTF-16LE").encode("UTF-8") + end + + def self.utf16le(string) + to_cstring(string).encode("UTF-16LE") + end + + def self.to_cstring(rubystring) + rubystring + 0.chr + end + + def self.win1252(string) + string.encode("Windows-1252") + end +end + + +#fileId = Winhelper.GetWindowsUniqueFileIdentifier('C:\inetpub\logs\LogFiles\W3SVC1\u_ex1fdsadfsadfasdf30612.log') +#p "FileId: " + fileId +#p "outside function, sleeping" +#sleep(10) diff --git a/lib/logstash/inputs/delete_completed_file_handler.rb b/lib/logstash/inputs/delete_completed_file_handler.rb new file mode 100644 index 00000000..6f8e8d0f --- /dev/null +++ b/lib/logstash/inputs/delete_completed_file_handler.rb @@ -0,0 +1,14 @@ +# encoding: utf-8 + +module LogStash module Inputs + class DeleteCompletedFileHandler + def initialize(watch) + @watch = watch + end + + def handle(path) + Pathname.new(path).unlink rescue nil + @watch.watched_files_collection.remove_paths([path]) + end + end +end end diff --git a/lib/logstash/inputs/file.rb b/lib/logstash/inputs/file.rb index 19185867..fe489f1a 100644 --- a/lib/logstash/inputs/file.rb +++ b/lib/logstash/inputs/file.rb @@ -2,9 +2,19 @@ require "logstash/namespace" require "logstash/inputs/base" require "logstash/codecs/identity_map_codec" +require 'logstash/plugin_mixins/ecs_compatibility_support' require "pathname" require "socket" # for Socket.gethostname +require "fileutils" +require "concurrent/atomic/atomic_reference" + +require_relative "file/patch" +require_relative "file_listener" +require_relative "delete_completed_file_handler" +require_relative "log_completed_file_handler" +require_relative "friendly_durations" +require "filewatch/bootstrap" # Stream events from files, normally by tailing them in a manner # similar to `tail -0F` but optionally reading them from the @@ -22,6 +32,12 @@ # beginning to end and storing all of it in a single event (not even # with the multiline codec or filter). # +# ==== Reading from remote network volumes +# +# The file input is not tested on remote filesystems such as NFS, Samba, s3fs-fuse, etc. These +# remote filesystems typically have behaviors that are very different from local filesystems and +# are therefore unlikely to work correctly when used with the file input. +# # ==== Tracking of current position in watched files # # The plugin keeps track of the current position in each file by @@ -69,25 +85,12 @@ # to the rotation and its reopening under the new name (an interval # determined by the `stat_interval` and `discover_interval` options) # will not get picked up. 
- -class LogStash::Codecs::Base - # TODO - move this to core - if !method_defined?(:accept) - def accept(listener) - decode(listener.data) do |event| - listener.process_event(event) - end - end - end - if !method_defined?(:auto_flush) - def auto_flush(*) - end - end -end - -class LogStash::Inputs::File < LogStash::Inputs::Base +module LogStash module Inputs +class File < LogStash::Inputs::Base config_name "file" + include PluginMixins::ECSCompatibilitySupport(:disabled, :v1, :v8 => :v1) + # The path(s) to the file(s) to use as an input. # You can use filename patterns here, such as `/var/log/*.log`. # If you use a pattern like `/var/log/**/*.log`, a recursive search @@ -111,7 +114,7 @@ class LogStash::Inputs::File < LogStash::Inputs::Base # How often (in seconds) we stat files to see if they have been modified. # Increasing this interval will decrease the number of system calls we make, # but increase the time to detect new log lines. - config :stat_interval, :validate => :number, :default => 1 + config :stat_interval, :validate => [FriendlyDurations, "seconds"], :default => 1 # How often (in seconds) we expand the filename patterns in the # `path` option to discover new files to watch. @@ -119,13 +122,13 @@ class LogStash::Inputs::File < LogStash::Inputs::Base # Path of the sincedb database file (keeps track of the current # position of monitored log files) that will be written to disk. - # The default will write sincedb files to some path matching `$HOME/.sincedb*` + # The default will write sincedb files to `/plugins/inputs/file` # NOTE: it must be a file path and not a directory path config :sincedb_path, :validate => :string # How often (in seconds) to write a since database with the current position of # monitored log files. - config :sincedb_write_interval, :validate => :number, :default => 15 + config :sincedb_write_interval, :validate => [FriendlyDurations, "seconds"], :default => 15 # Choose where Logstash starts initially reading files: at the beginning or # at the end. The default behavior treats files like live streams and thus @@ -144,19 +147,19 @@ class LogStash::Inputs::File < LogStash::Inputs::Base # When the file input discovers a file that was last modified # before the specified timespan in seconds, the file is ignored. - # After it's discovery, if an ignored file is modified it is no - # longer ignored and any new data is read. The default is 24 hours. - config :ignore_older, :validate => :number, :default => 24 * 60 * 60 + # After its discovery, if an ignored file is modified it is no + # longer ignored and any new data is read. By default, this option is + # disabled. Note this unit is in seconds. + config :ignore_older, :validate => [FriendlyDurations, "seconds"] # The file input closes any files that were last read the specified # timespan in seconds ago. - # This has different implications depending on if a file is being tailed or - # read. If tailing, and there is a large time gap in incoming data the file + # If tailing, and there is a large time gap in incoming data the file # can be closed (allowing other files to be opened) but will be queued for # reopening when new data is detected. If reading, the file will be closed # after closed_older seconds from when the last bytes were read. # The default is 1 hour - config :close_older, :validate => :number, :default => 1 * 60 * 60 + config :close_older, :validate => [FriendlyDurations, "seconds"], :default => "1 hour" # What is the maximum number of file_handles that this input consumes # at any one time. 
Use close_older to close some files if you need to @@ -166,15 +169,101 @@ class LogStash::Inputs::File < LogStash::Inputs::Base # The default of 4095 is set in filewatch. config :max_open_files, :validate => :number + # What mode do you want the file input to operate in. + # Tail a few files or read many content-complete files + # The default is tail + # If "read" is specified then the following other settings are ignored + # `start_position` (files are always read from the beginning) + # `delimiter` (files are assumed to use \n or \r (or both) as line endings) + # `close_older` (files are automatically 'closed' when EOF is reached) + # If "read" is specified then the following settings are heeded + # `ignore_older` (older files are not processed) + # "read" mode now supports gzip file processing + config :mode, :validate => [ "tail", "read"], :default => "tail" + + # When in 'read' mode, what action should be carried out when a file is done with. + # If 'delete' is specified then the file will be deleted. + # If 'log' is specified then the full path of the file is logged to the file specified + # in the `file_completed_log_path` setting. + config :file_completed_action, :validate => ["delete", "log", "log_and_delete"], :default => "delete" + + # Which file should the completely read file paths be appended to. + # Only specify this path to a file when `file_completed_action` is 'log' or 'log_and_delete'. + # IMPORTANT: this file is appended to only - it could become very large. You are responsible for file rotation. + config :file_completed_log_path, :validate => :string + + # The sincedb entry now has a last active timestamp associated with it. + # If no changes are detected in tracked files in the last N days their sincedb + # tracking record will expire and not be persisted. + # This option protects against the well known inode recycling problem. (add reference) + config :sincedb_clean_after, :validate => [FriendlyDurations, "days"], :default => "14 days" # days + + # File content is read off disk in blocks or chunks, then using whatever the set delimiter + # is, lines are extracted from the chunk. Specify the size in bytes of each chunk. + # See `file_chunk_count` to see why and when to change this from the default. + # The default set internally is 32768 (32KB) + config :file_chunk_size, :validate => :number, :default => FileWatch::FILE_READ_SIZE + + # When combined with the `file_chunk_size`, this option sets how many chunks + # are read from each file before moving to the next active file. + # e.g. a `chunk_count` of 32 with the default `file_chunk_size` will process + # 1MB from each active file. See the option `max_open_files` for more info. + # The default set internally is very large, 4611686018427387903. By default + # the file is read to the end before moving to the next active file. + config :file_chunk_count, :validate => :number, :default => FileWatch::MAX_ITERATIONS + + # Which attribute of a discovered file should be used to sort the discovered files. + # Files can be sort by modified date or full path alphabetic. + # The default is `last_modified` + # Previously the processing order of the discovered files was OS dependent. + config :file_sort_by, :validate => ["last_modified", "path"], :default => "last_modified" + + # Choose between ascending and descending order when also choosing between + # `last_modified` and `path` file_sort_by options. 
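
# The close_older and sincedb_clean_after settings above (and stat_interval,
# sincedb_write_interval, ignore_older earlier) now accept human-friendly
# duration strings. A sketch of how they are coerced to seconds by the
# FriendlyDurations module added later in this patch, assuming the plugin's lib
# directory is on the load path; a bare number falls back to the unit the
# setting declares ("seconds" here, "days" for sincedb_clean_after).
require "logstash/inputs/friendly_durations"

LogStash::Inputs::FriendlyDurations.call("250 ms").to_a       # => [true, 0.25]
LogStash::Inputs::FriendlyDurations.call("1 hour").to_a       # => [true, 3600.0]
LogStash::Inputs::FriendlyDurations.call("2w").to_a           # => [true, 1209600.0]
LogStash::Inputs::FriendlyDurations.call(15).to_a             # => [true, 15.0]
LogStash::Inputs::FriendlyDurations.call("14", "days").to_a   # => [true, 1209600.0]
LogStash::Inputs::FriendlyDurations.call("soon").to_a         # => [false, <error message>]
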
+ # If ingesting the newest data first is important then opt for last_modified + desc + # If ingesting the oldest data first is important then opt for last_modified + asc + # If you use a special naming convention for the file full paths then + # perhaps path + asc will help to achieve the goal of controlling the order of file ingestion + config :file_sort_direction, :validate => ["asc", "desc"], :default => "asc" + + # When in 'read' mode - this option is closing all file watchers when EOF is hit + # This option also disables discovery of new/changes files. It works only on files found at the beginning + # Sincedb still works, if you run LS once again after doing some changes - only new values will be read + config :exit_after_read, :validate => :boolean, :default => false + + # Before start read a compressed file, checks for its validity. + # This request a full read of the archive, so potentially could cost time. + # If not specified to true, and the file is corrupted, could end in cyclic processing of the broken file. + config :check_archive_validity, :validate => :boolean, :default => false + public + + class << self + alias_method :old_validate_value, :validate_value + + def validate_value(value, validator) + if validator.is_a?(Array) && validator.size == 2 && validator.first.respond_to?(:call) + callable, units = *validator + # returns a ValidatedStruct having a `to_a` method suitable to return to the config mixin caller + return callable.call(value, units).to_a + end + old_validate_value(value, validator) + end + end + + # @private used in specs + attr_reader :watcher + def register require "addressable/uri" - require "filewatch/tail" require "digest/md5" - @logger.info("Registering file input", :path => @path) + @logger.trace("Registering file input", :path => @path) @host = Socket.gethostname.force_encoding(Encoding::UTF_8) + # This check is Logstash 5 specific. If the class does not exist, and it + # won't in older versions of Logstash, then we need to set it to nil. + settings = defined?(LogStash::SETTINGS) ? LogStash::SETTINGS : nil - @tail_config = { + @filewatch_config = { :exclude => @exclude, :stat_interval => @stat_interval, :discover_interval => @discover_interval, @@ -182,7 +271,14 @@ def register :delimiter => @delimiter, :ignore_older => @ignore_older, :close_older => @close_older, - :max_open_files => @max_open_files + :max_open_files => @max_open_files, + :sincedb_clean_after => @sincedb_clean_after, + :file_chunk_count => @file_chunk_count, + :file_chunk_size => @file_chunk_size, + :file_sort_by => @file_sort_by, + :file_sort_direction => @file_sort_direction, + :exit_after_read => @exit_after_read, + :check_archive_validity => @check_archive_validity, } @path.each do |path| @@ -192,135 +288,173 @@ def register end if @sincedb_path.nil? - if ENV["SINCEDB_DIR"].nil? && ENV["HOME"].nil? - @logger.error("No SINCEDB_DIR or HOME environment variable set, I don't know where " \ - "to keep track of the files I'm watching. 
Either set " \ - "HOME or SINCEDB_DIR in your environment, or set sincedb_path in " \ - "in your Logstash config for the file input with " \ - "path '#{@path.inspect}'") - raise # TODO(sissel): HOW DO I FAIL PROPERLY YO - end - - #pick SINCEDB_DIR if available, otherwise use HOME - sincedb_dir = ENV["SINCEDB_DIR"] || ENV["HOME"] - - # Join by ',' to make it easy for folks to know their own sincedb - # generated path (vs, say, inspecting the @path array) - @sincedb_path = File.join(sincedb_dir, ".sincedb_" + Digest::MD5.hexdigest(@path.join(","))) - - # Migrate any old .sincedb to the new file (this is for version <=1.1.1 compatibility) - old_sincedb = File.join(sincedb_dir, ".sincedb") - if File.exists?(old_sincedb) - @logger.info("Renaming old ~/.sincedb to new one", :old => old_sincedb, - :new => @sincedb_path) - File.rename(old_sincedb, @sincedb_path) + base_sincedb_path = build_sincedb_base_from_settings(settings) || build_sincedb_base_from_env + @sincedb_path = build_random_sincedb_filename(base_sincedb_path) + @logger.info('No sincedb_path set, generating one based on the "path" setting', :sincedb_path => @sincedb_path.to_s, :path => @path) + else + @sincedb_path = Pathname.new(@sincedb_path) + if @sincedb_path.directory? + raise ArgumentError.new("The \"sincedb_path\" argument must point to a file, received a directory: \"#{@sincedb_path}\"") end - - @logger.info("No sincedb_path set, generating one based on the file path", - :sincedb_path => @sincedb_path, :path => @path) end - - if File.directory?(@sincedb_path) - raise ArgumentError.new("The \"sincedb_path\" argument must point to a file, received a directory: \"#{@sincedb_path}\"") + + @filewatch_config[:sincedb_path] = @sincedb_path + + @filewatch_config[:start_new_files_at] = @start_position.to_sym + + if @file_completed_action.include?('log') + if @file_completed_log_path.nil? + raise ArgumentError.new('The "file_completed_log_path" setting must be provided when the "file_completed_action" is set to "log" or "log_and_delete"') + else + @file_completed_log_path = Pathname.new(@file_completed_log_path) + unless @file_completed_log_path.exist? + begin + FileUtils.touch(@file_completed_log_path) + rescue + raise ArgumentError.new("The \"file_completed_log_path\" file can't be created: #{@file_completed_log_path}") + end + end + end end - @tail_config[:sincedb_path] = @sincedb_path - - if @start_position == "beginning" - @tail_config[:start_new_files_at] = :beginning + if tail_mode? 
+ if @exit_after_read + raise ArgumentError.new('The "exit_after_read" setting only works when the "mode" is set to "read"') + end + @watcher_class = FileWatch::ObservingTail + else + @watcher_class = FileWatch::ObservingRead end - @codec = LogStash::Codecs::IdentityMapCodec.new(@codec) - end # def register - - class ListenerTail - # use attr_reader to define noop methods - attr_reader :input, :path, :data - attr_reader :deleted, :created, :error, :eof - - # construct with upstream state - def initialize(path, input) - @path, @input = path, input - end - - def timed_out - input.codec.evict(path) - end - - def accept(data) - # and push transient data filled dup listener downstream - input.log_line_received(path, data) - input.codec.accept(dup_adding_state(data)) - end - - def process_event(event) - event.set("[@metadata][path]", path) - event.set("path", path) if !event.include?("path") - input.post_process_this(event) - end + @completely_stopped = Concurrent::AtomicBoolean.new + @queue = Concurrent::AtomicReference.new - def add_state(data) - @data = data - self - end - - private - - # duplicate and add state for downstream - def dup_adding_state(line) - self.class.new(path, input).add_state(line) - end - end + @source_host_field = ecs_select[disabled: 'host', v1:'[host][name]'] + @source_path_field = ecs_select[disabled: 'path', v1:'[log][file][path]'] + end # def register - class FlushableListener < ListenerTail - attr_writer :path + def completely_stopped? + # to synchronise after(:each) blocks in tests that remove the sincedb file before atomic_write completes + @completely_stopped.true? end + # The WatchedFile calls back here as `observer.listener_for(@path)` + # @param [String] path the identity def listener_for(path) - # path is the identity - ListenerTail.new(path, self) + FileListener.new(path, self) end - def begin_tailing + def start_processing # if the pipeline restarts this input, # make sure previous files are closed stop - # use observer listener api - @tail = FileWatch::Tail.new_observing(@tail_config) - @tail.logger = @logger - @path.each { |path| @tail.tail(path) } + + @watcher = @watcher_class.new(@filewatch_config) + + @completed_file_handlers = [] + if read_mode? + if @file_completed_action.include?('log') + @completed_file_handlers << LogCompletedFileHandler.new(@file_completed_log_path) + end + if @file_completed_action.include?('delete') + @completed_file_handlers << DeleteCompletedFileHandler.new(@watcher.watch) + end + end + + @path.each { |path| @watcher.watch_this(path) } end def run(queue) - begin_tailing - @queue = queue - @tail.subscribe(self) + start_processing + @queue.set queue + @watcher.subscribe(self) # halts here until quit is called + # last action of the subscribe call is to write the sincedb exit_flush + @completely_stopped.make_true end # def run - def post_process_this(event) - event.set("host", @host) if !event.include?("host") + def post_process_this(event, path) + event.set("[@metadata][path]", path) + event.set("[@metadata][host]", @host) + attempt_set(event, @source_host_field, @host) + attempt_set(event, @source_path_field, path) if path + decorate(event) - @queue << event + @queue.get << event + end + + def handle_deletable_path(path) + return if tail_mode? + return if @completed_file_handlers.empty? + @logger.debug? && @logger.debug(__method__.to_s, :path => path) + @completed_file_handlers.each { |handler| handler.handle(path) } end def log_line_received(path, line) - return if !@logger.debug? 
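
# Minimal stand-in (a plain Hash rather than a LogStash::Event) for what
# post_process_this above adds to each event, depending on ecs_compatibility.
# Field names follow the ecs_select lookups in register; the host/path values
# are illustrative.
def enrich(event, host, path, ecs_enabled)
  event["[@metadata][path]"] = path
  event["[@metadata][host]"] = host
  host_field = ecs_enabled ? "[host][name]"      : "host"
  path_field = ecs_enabled ? "[log][file][path]" : "path"
  event[host_field] ||= host          # attempt_set never overwrites an existing value
  event[path_field] ||= path if path
  event
end

enrich({}, "web-01", "/var/log/app.log", true)
# => {"[@metadata][path]"=>"/var/log/app.log", "[@metadata][host]"=>"web-01",
#     "[host][name]"=>"web-01", "[log][file][path]"=>"/var/log/app.log"}
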
- @logger.debug("Received line", :path => path, :text => line) + @logger.debug? && @logger.debug("Received line", :path => path, :text => line) end def stop - # in filewatch >= 0.6.7, quit will closes and forget all files - # but it will write their last read positions to since_db - # beforehand - if @tail + unless @watcher.nil? @codec.close - @tail.quit + @watcher.quit end end + # @private used in specs + def queue + @queue.get + end + private + def build_sincedb_base_from_settings(settings) + logstash_data_path = settings.get_value("path.data") + Pathname.new(logstash_data_path).join("plugins", "inputs", "file").tap do |path| + # Ensure that the filepath exists before writing, since it's deeply nested. + path.mkpath + end + end + + # Attempt to set an event's field to the provided value + # without overwriting an existing value or producing an error + def attempt_set(event, field_reference, value) + return false if event.include?(field_reference) + + event.set(field_reference, value) + rescue => e + logger.trace("failed to set #{field_reference} to `#{value}`", :exception => e.message) + false + end + + def build_sincedb_base_from_env + # This section is going to be deprecated eventually, as path.data will be + # the default, not an environment variable (SINCEDB_DIR or LOGSTASH_HOME) + if ENV["SINCEDB_DIR"].nil? && ENV["LOGSTASH_HOME"].nil? + @logger.error("No SINCEDB_DIR or LOGSTASH_HOME environment variable set, I don't know where " \ + "to keep track of the files I'm watching. Either set " \ + "LOGSTASH_HOME or SINCEDB_DIR in your environment, or set sincedb_path in " \ + "in your Logstash config for the file input with " \ + "path '#{@path.inspect}'") + raise ArgumentError.new('The "sincedb_path" setting was not given and the environment variables "SINCEDB_DIR" or "LOGSTASH_HOME" are not set so we cannot build a file path for the sincedb') + end + Pathname.new(ENV["SINCEDB_DIR"] || ENV["LOGSTASH_HOME"]) + end + + def build_random_sincedb_filename(pathname) + # Join by ',' to make it easy for folks to know their own sincedb + # generated path (vs, say, inspecting the @path array) + pathname.join(".sincedb_" + Digest::MD5.hexdigest(@path.join(","))) + end + + def tail_mode? + @mode == "tail" + end + + def read_mode? + !tail_mode? + end + def exit_flush listener = FlushableListener.new("none", self) if @codec.identity_count.zero? @@ -336,4 +470,4 @@ def exit_flush @codec.flush_mapped(listener) end end -end # class LogStash::Inputs::File +end end end# class LogStash::Inputs::File diff --git a/lib/logstash/inputs/file/patch.rb b/lib/logstash/inputs/file/patch.rb new file mode 100644 index 00000000..a54fb98b --- /dev/null +++ b/lib/logstash/inputs/file/patch.rb @@ -0,0 +1,16 @@ +# encoding: utf-8 +class LogStash::Codecs::Base + # TODO - move this to core + if !method_defined?(:accept) + def accept(listener) + decode(listener.data) do |event| + listener.process_event(event) + end + end + end + if !method_defined?(:auto_flush) + def auto_flush(*) + end + end +end + diff --git a/lib/logstash/inputs/file_listener.rb b/lib/logstash/inputs/file_listener.rb new file mode 100644 index 00000000..5a264a66 --- /dev/null +++ b/lib/logstash/inputs/file_listener.rb @@ -0,0 +1,51 @@ +# encoding: utf-8 + +module LogStash module Inputs + # As and when a new WatchedFile is processed FileWatch asks for an instance of this class for the + # file path of that WatchedFile. All subsequent callbacks are sent via this listener instance. 
+  # The file is essentially a stream and the path is the identity of that stream.
+  class FileListener
+    attr_reader :input, :path, :data
+    # construct with link back to the input plugin instance.
+    def initialize(path, input, data = nil)
+      @path, @input = path, input
+      @data = data
+    end
+
+    def opened
+    end
+
+    def eof
+    end
+
+    def error
+    end
+
+    def reading_completed
+    end
+
+    def timed_out
+      input.codec.evict(path)
+    end
+
+    def deleted
+      input.codec.evict(path)
+      input.handle_deletable_path(path)
+    end
+
+    def accept(data)
+      # and push transient data filled dup listener downstream
+      input.log_line_received(path, data)
+      input.codec.accept(self.class.new(path, input, data))
+    end
+
+    def process_event(event)
+      input.post_process_this(event, path)
+    end
+
+  end
+
+  class FlushableListener < FileListener
+    attr_writer :path
+  end
+end end
diff --git a/lib/logstash/inputs/friendly_durations.rb b/lib/logstash/inputs/friendly_durations.rb
new file mode 100644
index 00000000..07d0ee76
--- /dev/null
+++ b/lib/logstash/inputs/friendly_durations.rb
@@ -0,0 +1,45 @@
+# encoding: utf-8
+
+module LogStash module Inputs
+  module FriendlyDurations
+    NUMBERS_RE = /^(?<number>\d+(\.\d+)?)\s?(?<units>s((ec)?(ond)?)(s)?|m((in)?(ute)?)(s)?|h(our)?(s)?|d(ay)?(s)?|w(eek)?(s)?|us(ec)?(s)?|ms(ec)?(s)?)?$/
+    HOURS = 3600
+    DAYS = 24 * HOURS
+    MEGA = 10**6
+    KILO = 10**3
+
+    ValidatedStruct = Struct.new(:value, :error_message) do
+      def to_a
+        error_message.nil? ? [true, value] : [false, error_message]
+      end
+    end
+
+    def self.call(value, unit = "sec")
+      # coerce into seconds
+      val_string = value.to_s.strip
+      matched = NUMBERS_RE.match(val_string)
+      if matched.nil?
+        failed_message = "Value '#{val_string}' is not a valid duration string e.g. 200 usec, 250ms, 60 sec, 18h, 21.5d, 1 day, 2w, 6 weeks"
+        return ValidatedStruct.new(nil, failed_message)
+      end
+      multiplier = matched[:units] || unit
+      numeric = matched[:number].to_f
+      case multiplier
+      when "m","min","mins","minute","minutes"
+        ValidatedStruct.new(numeric * 60, nil)
+      when "h","hour","hours"
+        ValidatedStruct.new(numeric * HOURS, nil)
+      when "d","day","days"
+        ValidatedStruct.new(numeric * DAYS, nil)
+      when "w","week","weeks"
+        ValidatedStruct.new(numeric * 7 * DAYS, nil)
+      when "ms","msec","msecs"
+        ValidatedStruct.new(numeric / KILO, nil)
+      when "us","usec","usecs"
+        ValidatedStruct.new(numeric / MEGA, nil)
+      else
+        ValidatedStruct.new(numeric, nil)
+      end
+    end
+  end
+end end
diff --git a/lib/logstash/inputs/log_completed_file_handler.rb b/lib/logstash/inputs/log_completed_file_handler.rb
new file mode 100644
index 00000000..5210ac72
--- /dev/null
+++ b/lib/logstash/inputs/log_completed_file_handler.rb
@@ -0,0 +1,13 @@
+# encoding: utf-8
+
+module LogStash module Inputs
+  class LogCompletedFileHandler
+    def initialize(log_completed_file_path)
+      @log_completed_file_path = Pathname.new(log_completed_file_path)
+    end
+
+    def handle(path)
+      @log_completed_file_path.open("a") { |fd| fd.puts(path) }
+    end
+  end
+end end
diff --git a/logstash-input-file.gemspec b/logstash-input-file.gemspec
index 07fa3c42..032112f6 100644
--- a/logstash-input-file.gemspec
+++ b/logstash-input-file.gemspec
@@ -1,9 +1,9 @@
 Gem::Specification.new do |s|
 
   s.name = 'logstash-input-file'
-  s.version = '3.0.3'
-  s.licenses = ['Apache License (2.0)']
-  s.summary = "Stream events from files."
+ s.version = '4.4.6' + s.licenses = ['Apache-2.0'] + s.summary = "Streams events from files" s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program" s.authors = ["Elastic"] s.email = 'info@elastic.co' @@ -11,7 +11,7 @@ Gem::Specification.new do |s| s.require_paths = ["lib"] # Files - s.files = Dir['lib/**/*','spec/**/*','vendor/**/*','*.gemspec','*.md','CONTRIBUTORS','Gemfile','LICENSE','NOTICE.TXT'] + s.files = Dir["lib/**/*","spec/**/*","*.gemspec","*.md","CONTRIBUTORS","Gemfile","LICENSE","NOTICE.TXT", "vendor/jar-dependencies/**/*.jar", "vendor/jar-dependencies/**/*.rb", "VERSION", "JAR_VERSION", "docs/**/*"] # Tests s.test_files = s.files.grep(%r{^(test|spec|features)/}) @@ -23,13 +23,22 @@ Gem::Specification.new do |s| s.add_runtime_dependency "logstash-core-plugin-api", ">= 1.60", "<= 2.99" s.add_runtime_dependency 'logstash-codec-plain' - s.add_runtime_dependency 'addressable' - s.add_runtime_dependency 'filewatch', ['>= 0.8.1', '~> 0.8'] + + if RUBY_VERSION.start_with?("1") + s.add_runtime_dependency 'rake', '~> 12.2.0' + s.add_runtime_dependency 'addressable', '~> 2.4.0' + else + s.add_runtime_dependency 'addressable' + end + + s.add_runtime_dependency 'concurrent-ruby', '~> 1.0' s.add_runtime_dependency 'logstash-codec-multiline', ['~> 3.0'] + s.add_runtime_dependency 'logstash-mixin-ecs_compatibility_support', '~>1.3' s.add_development_dependency 'stud', ['~> 0.0.19'] s.add_development_dependency 'logstash-devutils' s.add_development_dependency 'logstash-codec-json' s.add_development_dependency 'rspec-sequencing' + s.add_development_dependency "rspec-wait" + s.add_development_dependency 'timecop' end - diff --git a/run_until_fail.sh b/run_until_fail.sh new file mode 100755 index 00000000..3814284c --- /dev/null +++ b/run_until_fail.sh @@ -0,0 +1,4 @@ +while true +do + LOG_AT=ERROR bundle exec rspec -fd --fail-fast --tag ~lsof ./spec || break +done diff --git a/settings.gradle b/settings.gradle new file mode 100644 index 00000000..73158f09 --- /dev/null +++ b/settings.gradle @@ -0,0 +1 @@ +rootProject.name = 'filewatch' \ No newline at end of file diff --git a/spec/file_ext/file_ext_windows_spec.rb b/spec/file_ext/file_ext_windows_spec.rb new file mode 100644 index 00000000..df54d989 --- /dev/null +++ b/spec/file_ext/file_ext_windows_spec.rb @@ -0,0 +1,36 @@ +# encoding: utf-8 + +require_relative '../filewatch/spec_helper' + +if LogStash::Environment.windows? 
+ describe "basic ops" do + let(:fixture_dir) { Pathname.new(FileWatch::FIXTURE_DIR).expand_path } + let(:file_path) { fixture_dir.join('uncompressed.log') } + it "path works" do + path = file_path.to_path + identifier = Winhelper.identifier_from_path(path) + STDOUT.puts("--- >>", identifier, "------") + expect(identifier.count('-')).to eq(2) + fs_name = Winhelper.file_system_type_from_path(path) + STDOUT.puts("--- >>", fs_name, "------") + expect(fs_name).to eq("NTFS") + # identifier = Winhelper.identifier_from_path_ex(path) + # STDOUT.puts("--- >>", identifier, "------") + # expect(identifier.count('-')).to eq(2) + end + + it "io works" do + file = FileWatch::FileOpener.open(file_path.to_path) + identifier = Winhelper.identifier_from_io(file) + file.close + STDOUT.puts("--- >>", identifier, "------") + expect(identifier.count('-')).to eq(2) + # fs_name = Winhelper.file_system_type_from_io(file) + # STDOUT.puts("--- >>", fs_name, "------") + # expect(fs_name).to eq("NTFS") + # identifier = Winhelper.identifier_from_path_ex(path) + # STDOUT.puts("--- >>", identifier, "------") + # expect(identifier.count('-')).to eq(2) + end + end +end diff --git a/spec/filewatch/buftok_spec.rb b/spec/filewatch/buftok_spec.rb new file mode 100644 index 00000000..c27cccb1 --- /dev/null +++ b/spec/filewatch/buftok_spec.rb @@ -0,0 +1,25 @@ +# encoding: utf-8 +require_relative 'spec_helper' + +describe FileWatch::BufferedTokenizer do + + context "when using the default delimiter" do + it "splits the lines correctly" do + expect(subject.extract("hello\nworld\n")).to eq ["hello", "world"] + end + + it "holds partial lines back until a token is found" do + buffer = described_class.new + expect(buffer.extract("hello\nwor")).to eq ["hello"] + expect(buffer.extract("ld\n")).to eq ["world"] + end + end + + context "when passing a custom delimiter" do + subject { FileWatch::BufferedTokenizer.new("\r\n") } + + it "splits the lines correctly" do + expect(subject.extract("hello\r\nworld\r\n")).to eq ["hello", "world"] + end + end +end diff --git a/spec/filewatch/read_mode_handlers_read_file_spec.rb b/spec/filewatch/read_mode_handlers_read_file_spec.rb new file mode 100644 index 00000000..5b0a16e6 --- /dev/null +++ b/spec/filewatch/read_mode_handlers_read_file_spec.rb @@ -0,0 +1,80 @@ +# encoding: utf-8 +require_relative 'spec_helper' + +module FileWatch + describe ReadMode::Handlers::ReadFile do + let(:settings) do + Settings.from_options( + :sincedb_write_interval => 0, + :sincedb_path => File::NULL + ) + end + let(:sdb_collection) { SincedbCollection.new(settings) } + let(:directory) { Pathname.new(FIXTURE_DIR) } + let(:pathname) { directory.join('uncompressed.log') } + let(:watched_file) { WatchedFile.new(pathname, PathStatClass.new(pathname), settings) } + let(:processor) { ReadMode::Processor.new(settings).add_watch(watch) } + let(:file) { DummyFileReader.new(settings.file_chunk_size, 2) } + + context "simulate reading a 64KB file with a default chunk size of 32KB and a zero sincedb write interval" do + let(:watch) { double("watch", :quit? 
=> false) } + it "calls 'sincedb_write' exactly 2 times" do + allow(FileOpener).to receive(:open).with(watched_file.path).and_return(file) + expect(sdb_collection).to receive(:sincedb_write).exactly(1).times + watched_file.activate + processor.initialize_handlers(sdb_collection, TestObserver.new) + processor.read_file(watched_file) + end + end + + context "simulate reading a 64KB file with a default chunk size of 32KB and a zero sincedb write interval" do + let(:watch) { double("watch", :quit? => true) } + it "calls 'sincedb_write' exactly 0 times as shutdown is in progress" do + expect(sdb_collection).to receive(:sincedb_write).exactly(0).times + watched_file.activate + processor.initialize_handlers(sdb_collection, TestObserver.new) + processor.read_file(watched_file) + end + end + + context "when restart from existing sincedb" do + let(:settings) do + Settings.from_options( + :sincedb_write_interval => 0, + :sincedb_path => File::NULL, + :file_chunk_size => 10 + ) + end + + let(:processor) { double("fake processor") } + let(:observer) { TestObserver.new } + let(:watch) { double("watch") } + + before(:each) { + allow(watch).to receive(:quit?).and_return(false)#.and_return(false).and_return(true) + allow(processor).to receive(:watch).and_return(watch) + } + + it "read from where it left" do + listener = observer.listener_for(Pathname.new(pathname).to_path) + sut = ReadMode::Handlers::ReadFile.new(processor, sdb_collection, observer, settings) + + # simulate a previous partial read of the file + sincedb_value = SincedbValue.new(0) + sincedb_value.set_watched_file(watched_file) + sdb_collection.set(watched_file.sincedb_key, sincedb_value) + + + # simulate a consumption of first line, (size + newline) bytes + sdb_collection.increment(watched_file.sincedb_key, File.readlines(pathname)[0].size + 2) + + # exercise + sut.handle(watched_file) + + # verify + expect(listener.lines.size).to eq(1) + expect(listener.lines[0]).to start_with("2010-03-12 23:51:21 SEA4 192.0.2.222 play 3914 OK") + end + end + end +end diff --git a/spec/filewatch/reading_spec.rb b/spec/filewatch/reading_spec.rb new file mode 100644 index 00000000..91f7b644 --- /dev/null +++ b/spec/filewatch/reading_spec.rb @@ -0,0 +1,316 @@ +# encoding: utf-8 +require 'stud/temporary' +require_relative 'spec_helper' +require 'filewatch/observing_read' + +module FileWatch + describe Watch do + before(:all) do + @thread_abort = Thread.abort_on_exception + Thread.abort_on_exception = true + end + + after(:all) do + Thread.abort_on_exception = @thread_abort + end + + let(:directory) { Stud::Temporary.directory } + let(:watch_dir) { ::File.join(directory, "*.log") } + let(:file_path) { ::File.join(directory, "1.log") } + let(:sincedb_path) { ::File.join(Stud::Temporary.directory, "reading.sdb") } + let(:stat_interval) { 0.1 } + let(:discover_interval) { 4 } + let(:start_new_files_at) { :end } # should be irrelevant for read mode + let(:opts) do + { + :stat_interval => stat_interval, + :start_new_files_at => start_new_files_at, + :delimiter => "\n", + :discover_interval => discover_interval, + :ignore_older => 3600, + :sincedb_path => sincedb_path + } + end + let(:observer) { TestObserver.new } + let(:reading) { ObservingRead.new(opts) } + let(:listener1) { observer.listener_for(file_path) } + + after do + FileUtils.rm_rf(directory) unless directory =~ /fixture/ + end + + context "when watching a directory with files" do + let(:actions) do + RSpec::Sequencing.run("quit after a short time") do + File.open(file_path, "wb") { |file| 
file.write("line1\nline2\n") } + end + .then("watch") do + reading.watch_this(watch_dir) + end + .then("wait") do + wait(2).for{listener1.calls.last}.to eq(:delete) + end + .then("quit") do + reading.quit + end + end + it "the file is read" do + actions.activate_quietly + reading.subscribe(observer) + actions.assert_no_errors + expect(listener1.calls).to eq([:open, :accept, :accept, :eof, :delete]) + expect(listener1.lines).to eq(["line1", "line2"]) + end + end + + context "when watching a directory with files and sincedb_path is /dev/null or NUL" do + let(:sincedb_path) { File::NULL } + let(:actions) do + RSpec::Sequencing.run("quit after a short time") do + File.open(file_path, "wb") { |file| file.write("line1\nline2\n") } + end + .then("watch") do + reading.watch_this(watch_dir) + end + .then("wait") do + wait(2).for{listener1.calls.last}.to eq(:delete) + end + .then("quit") do + reading.quit + end + end + it "the file is read" do + actions.activate_quietly + reading.subscribe(observer) + actions.assert_no_errors + expect(listener1.calls).to eq([:open, :accept, :accept, :eof, :delete]) + expect(listener1.lines).to eq(["line1", "line2"]) + end + end + + context "when watching a directory with files using striped reading" do + let(:file_path2) { ::File.join(directory, "2.log") } + # use a chunk size that does not align with the line boundaries + let(:opts) { super().merge(:file_chunk_size => 10, :file_chunk_count => 1, :file_sort_by => "path")} + let(:lines) { [] } + let(:observer) { TestObserver.new(lines) } + let(:listener2) { observer.listener_for(file_path2) } + let(:actions) do + RSpec::Sequencing.run("create file") do + File.open(file_path, "w") { |file| file.write("string1\nstring2") } + File.open(file_path2, "w") { |file| file.write("stringA\nstringB") } + end + .then("watch") do + reading.watch_this(watch_dir) + end + .then("wait") do + wait(2).for{listener1.calls.last == :delete && listener2.calls.last == :delete}.to eq(true) + end + .then("quit") do + reading.quit + end + end + it "the files are read seemingly in parallel" do + actions.activate_quietly + reading.subscribe(observer) + actions.assert_no_errors + expect(listener1.calls).to eq([:open, :accept, :accept, :eof, :delete]) + expect(listener2.calls).to eq([:open, :accept, :accept, :eof, :delete]) + expect(lines).to eq(%w(string1 stringA string2 stringB)) + end + end + + context "when a non default delimiter is specified and it is not in the content" do + let(:opts) { super().merge(:delimiter => "\nø") } + let(:actions) do + RSpec::Sequencing.run("create file") do + File.open(file_path, "wb") { |file| file.write("line1\nline2") } + end + .then("watch") do + reading.watch_this(watch_dir) + end + .then("wait") do + wait(2).for{listener1.calls.last}.to eq(:delete) + end + .then("quit") do + reading.quit + end + end + it "the file is opened, data is read, but no lines are found initially, at EOF the whole file becomes the line" do + actions.activate_quietly + reading.subscribe(observer) + actions.assert_no_errors + expect(listener1.calls).to eq([:open, :accept, :eof, :delete]) + expect(listener1.lines).to eq(["line1\nline2"]) + sincedb_record_fields = File.read(sincedb_path).split(" ") + position_field_index = 3 + # tailing, no delimiter, we are expecting one, if it grows we read from the start. + # there is an info log telling us that no lines were seen but we can't test for it. 
+ expect(sincedb_record_fields[position_field_index]).to eq("11") + end + end + + context "when watching directory with files and adding a new file" do + let(:file_path2) { ::File.join(directory, "2.log") } + let(:file_path3) { ::File.join(directory, "3.log") } + + let(:opts) { super().merge(:file_sort_by => "last_modified") } + let(:lines) { [] } + let(:observer) { TestObserver.new(lines) } + + + let(:listener2) { observer.listener_for(file_path2) } + let(:listener3) { observer.listener_for(file_path3) } + + let(:actions) do + RSpec::Sequencing.run("create12") do + File.open(file_path, "w") { |file| file.write("string11\nstring12") } + File.open(file_path2, "w") { |file| file.write("string21\nstring22") } + end + .then("watch") do + reading.watch_this(watch_dir) + end + .then("wait12") do + wait(2).for { listener1.calls.last == :delete && listener2.calls.last == :delete }.to eq(true) + end + .then_after(2, "create3") do + File.open(file_path3, "w") { |file| file.write("string31\nstring32") } + end + .then("wait3") do + wait(2).for { listener3.calls.last == :delete }.to eq(true) + end + .then("quit") do + reading.quit + end + end + + it "reads all (3) files" do + actions.activate_quietly + reading.subscribe(observer) + actions.assert_no_errors + expect(lines.last).to eq 'string32' + expect(lines.sort).to eq %w(string11 string12 string21 string22 string31 string32) + expect( reading.watch.watched_files_collection.paths ).to eq [ file_path, file_path2, file_path3 ] + end + end + + context "when watching a directory with files using exit_after_read" do + let(:opts) { super().merge(:exit_after_read => true, :max_open_files => 2) } + let(:file_path3) { ::File.join(directory, "3.log") } + let(:file_path4) { ::File.join(directory, "4.log") } + let(:file_path5) { ::File.join(directory, "5.log") } + let(:lines) { [] } + let(:observer) { TestObserver.new(lines) } + let(:listener3) { observer.listener_for(file_path3) } + let(:file_path6) { ::File.join(directory, "6.log") } + let(:listener6) { observer.listener_for(file_path6) } + + it "the file is read" do + File.open(file_path3, "w") { |file| file.write("line1\nline2\n") } + reading.watch_this(watch_dir) + reading.subscribe(observer) + expect(listener3.lines).to eq(["line1", "line2"]) + end + + it "multiple files are read" do + File.open(file_path3, "w") { |file| file.write("line1\nline2\n") } + File.open(file_path4, "w") { |file| file.write("line3\nline4\n") } + reading.watch_this(watch_dir) + reading.subscribe(observer) + expect(listener3.lines.sort).to eq(["line1", "line2", "line3", "line4"]) + end + + it "multiple files are read even if max_open_files is smaller then number of files" do + File.open(file_path3, "w") { |file| file.write("line1\nline2\n") } + File.open(file_path4, "w") { |file| file.write("line3\nline4\n") } + File.open(file_path5, "w") { |file| file.write("line5\nline6\n") } + reading.watch_this(watch_dir) + reading.subscribe(observer) + expect(listener3.lines.sort).to eq(["line1", "line2", "line3", "line4", "line5", "line6"]) + end + + it "file as marked as reading_completed" do + File.open(file_path3, "w") { |file| file.write("line1\nline2\n") } + reading.watch_this(watch_dir) + reading.subscribe(observer) + expect(listener3.calls).to eq([:open, :accept, :accept, :eof, :delete, :reading_completed]) + end + + it "sincedb works correctly" do + File.open(file_path3, "w") { |file| file.write("line1\nline2\n") } + reading.watch_this(watch_dir) + reading.subscribe(observer) + sincedb_record_fields = File.read(sincedb_path).split(" ") 
+ position_field_index = 3 + expect(sincedb_record_fields[position_field_index]).to eq("12") + end + + it "does not include new files added after start" do + File.open(file_path3, "w") { |file| file.write("line1\nline2\n") } + reading.watch_this(watch_dir) + reading.subscribe(observer) + File.open(file_path6, "w") { |file| file.write("foob\nbar\n") } + expect(listener3.lines).to eq(["line1", "line2"]) + expect(listener3.calls).to eq([:open, :accept, :accept, :eof, :delete, :reading_completed]) + expect(listener6.calls).to eq([]) + end + + end + + describe "reading fixtures" do + let(:directory) { FIXTURE_DIR } + let(:actions) do + RSpec::Sequencing.run("watch") do + reading.watch_this(watch_dir) + end + .then("wait") do + wait(1).for{listener1.calls.last}.to eq(:delete) + end + .then("quit") do + reading.quit + end + end + context "for an uncompressed file" do + let(:watch_dir) { ::File.join(directory, "unc*.log") } + let(:file_path) { ::File.join(directory, 'uncompressed.log') } + + it "the file is read" do + FileWatch.make_fixture_current(file_path) + actions.activate_quietly + reading.subscribe(observer) + actions.assert_no_errors + expect(listener1.calls).to eq([:open, :accept, :accept, :eof, :delete]) + expect(listener1.lines.size).to eq(2) + end + end + + context "for another uncompressed file" do + let(:watch_dir) { ::File.join(directory, "invalid*.log") } + let(:file_path) { ::File.join(directory, 'invalid_utf8.gbk.log') } + + it "the file is read" do + FileWatch.make_fixture_current(file_path) + actions.activate_quietly + reading.subscribe(observer) + actions.assert_no_errors + expect(listener1.calls).to eq([:open, :accept, :accept, :eof, :delete]) + expect(listener1.lines.size).to eq(2) + end + end + + context "for a compressed file" do + let(:watch_dir) { ::File.join(directory, "compressed.*.gz") } + let(:file_path) { ::File.join(directory, 'compressed.log.gz') } + + it "the file is read" do + FileWatch.make_fixture_current(file_path) + actions.activate_quietly + reading.subscribe(observer) + actions.assert_no_errors + expect(listener1.calls).to eq([:open, :accept, :accept, :eof, :delete]) + expect(listener1.lines.size).to eq(2) + end + end + end + end +end diff --git a/spec/filewatch/rotate_spec.rb b/spec/filewatch/rotate_spec.rb new file mode 100644 index 00000000..cdef967d --- /dev/null +++ b/spec/filewatch/rotate_spec.rb @@ -0,0 +1,496 @@ +# encoding: utf-8 +require 'stud/temporary' +require_relative 'spec_helper' +require 'filewatch/observing_tail' + +# simulate size based rotation ala +# See https://docs.python.org/2/library/logging.handlers.html#rotatingfilehandler +# The specified file is opened and used as the stream for logging. +# If mode is not specified, 'a' is used. If encoding is not None, it is used to +# open the file with that encoding. If delay is true, then file opening is deferred +# until the first call to emit(). By default, the file grows indefinitely. +# You can use the maxBytes and backupCount values to allow the file to rollover +# at a predetermined size. When the size is about to be exceeded, the file is +# closed and a new file is silently opened for output. Rollover occurs whenever +# the current log file is nearly maxBytes in length; if either of maxBytes or +# backupCount is zero, rollover never occurs. If backupCount is non-zero, the +# system will save old log files by appending the extensions ‘.1’, ‘.2’ etc., +# to the filename. 
For example, with a backupCount of 5 and a base file name of +# app.log, you would get app.log, app.log.1, app.log.2, up to app.log.5. +# The file being written to is always app.log. When this file is filled, it is +# closed and renamed to app.log.1, and if files app.log.1, app.log.2, etc. +# exist, then they are renamed to app.log.2, app.log.3 etc. respectively. + +module FileWatch + describe Watch, :unix => true do + let(:directory) { Pathname.new(Stud::Temporary.directory) } + let(:file1_path) { file_path.to_path } + let(:max) { 4095 } + let(:stat_interval) { 0.01 } + let(:discover_interval) { 15 } + let(:start_new_files_at) { :end } + let(:sincedb_path) { directory.join("tailing.sdb") } + let(:opts) do + { + :stat_interval => stat_interval, :start_new_files_at => start_new_files_at, :max_open_files => max, + :delimiter => "\n", :discover_interval => discover_interval, :sincedb_path => sincedb_path.to_path + } + end + let(:observer) { TestObserver.new } + let(:tailing) { ObservingTail.new(opts) } + let(:line1) { "Line 1 - Lorem ipsum dolor sit amet, consectetur adipiscing elit." } + let(:line2) { "Line 2 - Proin ut orci lobortis, congue diam in, dictum est." } + let(:line3) { "Line 3 - Sed vestibulum accumsan sollicitudin." } + + before do + directory + wait(1.0).for{Dir.exist?(directory)}.to eq(true) + end + + after do + FileUtils.rm_rf(directory) + wait(1.0).for{Dir.exist?(directory)}.to eq(false) + end + + context "create + rename rotation: when a new logfile is renamed to a path we have seen before and the open file is fully read, renamed outside glob" do + let(:watch_dir) { directory.join("*A.log") } + let(:file_path) { directory.join("1A.log") } + subject { described_class.new(conf) } + let(:listener1) { observer.listener_for(file1_path) } + let(:listener2) { observer.listener_for(second_file.to_path) } + let(:actions) do + RSpec::Sequencing + .run_after(0.25, "create file") do + file_path.open("wb") { |file| file.write("#{line1}\n") } + end + .then_after(0.25, "write a 'unfinished' line") do + file_path.open("ab") { |file| file.write(line2) } + end + .then_after(0.25, "rotate once") do + tmpfile = directory.join("1.logtmp") + tmpfile.open("wb") { |file| file.write("\n#{line3}\n")} + file_path.rename(directory.join("1.log.1")) + FileUtils.mv(directory.join("1.logtmp").to_path, file1_path) + end + .then("wait for expectation") do + sleep(0.25) # if ENV['CI'] + wait(2).for { listener1.calls }.to eq([:open, :accept, :accept, :accept]) + end + .then("quit") do + tailing.quit + end + end + + it "content from both inodes are sent via the same stream" do + actions.activate_quietly + tailing.watch_this(watch_dir.to_path) + tailing.subscribe(observer) + actions.assert_no_errors + lines = listener1.lines + expect(lines[0]).to eq(line1) + expect(lines[1]).to eq(line2) + expect(lines[2]).to eq(line3) + end + end + + context "create + rename rotation: a multiple file rename cascade" do + let(:watch_dir) { directory.join("*B.log") } + let(:file_path) { directory.join("1B.log") } + subject { described_class.new(conf) } + let(:second_file) { directory.join("2B.log") } + let(:third_file) { directory.join("3B.log") } + let(:listener1) { observer.listener_for(file1_path) } + let(:listener2) { observer.listener_for(second_file.to_path) } + let(:listener3) { observer.listener_for(third_file.to_path) } + let(:actions) do + RSpec::Sequencing + .run_after(0.25, "create file") do + file_path.open("wb") { |file| file.write("#{line1}\n") } + end + .then_after(0.25, "rotate 1 - line1(66) is in 2B.log, 
line2(61) is in 1B.log") do + file_path.rename(second_file) + file_path.open("wb") { |file| file.write("#{line2}\n") } + end + .then_after(0.25, "rotate 2 - line1(66) is in 3B.log, line2(61) is in 2B.log, line3(47) is in 1B.log") do + second_file.rename(third_file) + file_path.rename(second_file) + file_path.open("wb") { |file| file.write("#{line3}\n") } + end + .then("wait for expectations to be met") do + wait(0.75).for{listener1.lines.size == 3 && listener3.lines.empty? && listener2.lines.empty?}.to eq(true) + end + .then("quit") do + tailing.quit + end + end + + it "content from both inodes are sent via the same stream" do + actions.activate_quietly + tailing.watch_this(watch_dir.to_path) + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.lines[0]).to eq(line1) + expect(listener1.lines[1]).to eq(line2) + expect(listener1.lines[2]).to eq(line3) + end + end + + context "create + rename rotation: a two file rename cascade in slow motion" do + let(:watch_dir) { directory.join("*C.log") } + let(:file_path) { directory.join("1C.log") } + let(:stat_interval) { 0.01 } + subject { described_class.new(conf) } + let(:second_file) { directory.join("2C.log") } + let(:listener1) { observer.listener_for(file1_path) } + let(:listener2) { observer.listener_for(second_file.to_path) } + let(:actions) do + RSpec::Sequencing + .run_after(0.25, "create original - write line 1, 66 bytes") do + file_path.open("wb") { |file| file.write("#{line1}\n") } + end + .then_after(0.25, "rename to 2.log") do + file_path.rename(second_file) + end + .then_after(0.25, "write line 2 to original, 61 bytes") do + file_path.open("wb") { |file| file.write("#{line2}\n") } + end + .then_after(0.25, "rename to 2.log again") do + file_path.rename(second_file) + end + .then_after(0.25, "write line 3 to original, 47 bytes") do + file_path.open("wb") { |file| file.write("#{line3}\n") } + end + .then("wait for expectations to be met") do + wait(1).for{listener1.lines.size == 3 && listener2.lines.empty?}.to eq(true) + end + .then("quit") do + tailing.quit + end + end + + it "content from both inodes are sent via the same stream AND content from the rotated file is not read again" do + actions.activate_quietly + tailing.watch_this(watch_dir.to_path) + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.lines[0]).to eq(line1) + expect(listener1.lines[1]).to eq(line2) + expect(listener1.lines[2]).to eq(line3) + end + end + + context "create + rename rotation: a two file rename cascade in normal speed" do + let(:watch_dir) { directory.join("*D.log") } + let(:file_path) { directory.join("1D.log") } + subject { described_class.new(conf) } + let(:second_file) { directory.join("2D.log") } + let(:listener1) { observer.listener_for(file1_path) } + let(:listener2) { observer.listener_for(second_file.to_path) } + let(:actions) do + RSpec::Sequencing + .run_after(0.25, "create original - write line 1, 66 bytes") do + file_path.open("wb") { |file| file.write("#{line1}\n") } + end + .then_after(0.25, "rename to 2.log") do + file_path.rename(second_file) + file_path.open("wb") { |file| file.write("#{line2}\n") } + end + .then_after(0.25, "rename to 2.log again") do + file_path.rename(second_file) + file_path.open("wb") { |file| file.write("#{line3}\n") } + end + .then("wait for expectations to be met") do + wait(0.5).for{listener1.lines.size == 3 && listener2.lines.empty?}.to eq(true) + end + .then("quit") do + tailing.quit + end + end + + it "content from both inodes are sent via the same stream AND 
content from the rotated file is not read again" do + actions.activate_quietly + tailing.watch_this(watch_dir.to_path) + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.lines[0]).to eq(line1) + expect(listener1.lines[1]).to eq(line2) + expect(listener1.lines[2]).to eq(line3) + end + end + + context "create + rename rotation: when a new logfile is renamed to a path we have seen before but not all content from the previous the file is read" do + let(:opts) { super().merge( + :file_chunk_size => line1.bytesize.succ, + :file_chunk_count => 1 + ) } + let(:watch_dir) { directory.join("*E.log") } + let(:file_path) { directory.join("1E.log") } + subject { described_class.new(conf) } + let(:listener1) { observer.listener_for(file1_path) } + let(:actions) do + RSpec::Sequencing + .run_after(0.25, "create file") do + file_path.open("wb") do |file| + 65.times{file.puts(line1)} + end + end + .then_after(0.25, "rotate") do + tmpfile = directory.join("1E.logtmp") + tmpfile.open("wb") { |file| file.puts(line1)} + file_path.rename(directory.join("1E.log.1")) + tmpfile.rename(directory.join("1E.log")) + end + .then("wait for expectations to be met") do + wait(0.5).for{listener1.lines.size}.to eq(66) + end + .then("quit") do + tailing.quit + end + end + + it "content from both inodes are sent via the same stream" do + actions.activate_quietly + tailing.watch_this(watch_dir.to_path) + tailing.subscribe(observer) + actions.assert_no_errors + expected_calls = ([:accept] * 66).unshift(:open) + expect(listener1.lines.uniq).to eq([line1]) + expect(listener1.calls).to eq(expected_calls) + expect(sincedb_path.readlines.size).to eq(2) + end + end + + context "copy + truncate rotation: when a logfile is copied to a new path and truncated and the open file is fully read" do + let(:watch_dir) { directory.join("*F.log") } + let(:file_path) { directory.join("1F.log") } + subject { described_class.new(conf) } + let(:listener1) { observer.listener_for(file1_path) } + let(:actions) do + RSpec::Sequencing + .run_after(0.25, "create file") do + file_path.open("wb") { |file| file.puts(line1); file.puts(line2) } + end + .then_after(0.25, "rotate") do + FileUtils.cp(file1_path, directory.join("1F.log.1").to_path) + file_path.truncate(0) + end + .then_after(0.25, "write to truncated file") do + file_path.open("wb") { |file| file.puts(line3) } + end + .then("wait for expectations to be met") do + wait(0.5).for{listener1.lines.size}.to eq(3) + end + .then("quit") do + tailing.quit + end + end + + it "content is read correctly" do + actions.activate_quietly + tailing.watch_this(watch_dir.to_path) + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.lines).to eq([line1, line2, line3]) + expect(listener1.calls).to eq([:open, :accept, :accept, :accept]) + end + end + + context "copy + truncate rotation: when a logfile is copied to a new path and truncated before the open file is fully read" do + let(:opts) { super().merge( + :file_chunk_size => line1.bytesize.succ, + :file_chunk_count => 1 + ) } + let(:watch_dir) { directory.join("*G.log") } + let(:file_path) { directory.join("1G.log") } + subject { described_class.new(conf) } + let(:listener1) { observer.listener_for(file1_path) } + let(:actions) do + RSpec::Sequencing + .run_after(0.25, "create file") do + file_path.open("wb") { |file| 65.times{file.puts(line1)} } + end + .then_after(0.25, "rotate") do + FileUtils.cp(file1_path, directory.join("1G.log.1").to_path) + file_path.truncate(0) + end + .then_after(0.25, "write to truncated 
file") do + file_path.open("wb") { |file| file.puts(line3) } + end + .then("wait for expectations to be met") do + wait(0.5).for{listener1.lines.last}.to eq(line3) + end + .then("quit") do + tailing.quit + end + end + + it "unread content before the truncate is lost" do + actions.activate_quietly + tailing.watch_this(watch_dir.to_path) + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.lines.size).to be < 66 + end + end + + context "? rotation: when an active file is renamed inside the glob and the reading does not lag" do + let(:watch_dir) { directory.join("*H.log") } + let(:file_path) { directory.join("1H.log") } + let(:file2) { directory.join("2H.log") } + subject { described_class.new(conf) } + let(:listener1) { observer.listener_for(file1_path) } + let(:listener2) { observer.listener_for(file2.to_path) } + let(:actions) do + RSpec::Sequencing + .run_after(0.25, "create file") do + file_path.open("wb") { |file| file.puts(line1); file.puts(line2) } + end + .then_after(0.25, "rename") do + FileUtils.mv(file1_path, file2.to_path) + end + .then_after(0.25, "write to renamed file") do + file2.open("ab") { |file| file.puts(line3) } + end + .then("wait for expectations to be met") do + wait(0.75).for{listener1.lines.size + listener2.lines.size}.to eq(3) + end + .then("quit") do + tailing.quit + end + end + + it "content is read correctly, the renamed file is not reread from scratch" do + actions.activate_quietly + tailing.watch_this(watch_dir.to_path) + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.lines).to eq([line1, line2]) + expect(listener2.lines).to eq([line3]) + end + end + + context "? rotation: when an active file is renamed inside the glob and the reading lags behind" do + let(:opts) { super().merge( + :file_chunk_size => line1.bytesize.succ, + :file_chunk_count => 2 + ) } + let(:watch_dir) { directory.join("*I.log") } + let(:file_path) { directory.join("1I.log") } + let(:file2) { directory.join("2I.log") } + subject { described_class.new(conf) } + let(:listener1) { observer.listener_for(file1_path) } + let(:listener2) { observer.listener_for(file2.to_path) } + let(:actions) do + RSpec::Sequencing + .run_after(0.25, "create file") do + file_path.open("wb") { |file| 65.times{file.puts(line1)} } + end + .then_after(0.25, "rename") do + FileUtils.mv(file1_path, file2.to_path) + end + .then_after(0.25, "write to renamed file") do + file2.open("ab") { |file| file.puts(line3) } + end + .then("wait for expectations to be met") do + wait(1.25).for{listener1.lines.size + listener2.lines.size}.to eq(66) + end + .then("quit") do + tailing.quit + end + end + + it "content is read correctly, the renamed file is not reread from scratch" do + actions.activate_quietly + tailing.watch_this(watch_dir.to_path) + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener2.lines.last).to eq(line3) + end + end + + context "? 
rotation: when a not active file is rotated outside the glob before the file is read" do + let(:opts) { super().merge( + :close_older => 3600, + :max_open_files => 1, + :file_sort_by => "path" + ) } + let(:watch_dir) { directory.join("*J.log") } + let(:file_path) { directory.join("1J.log") } + let(:file2) { directory.join("2J.log") } + let(:file3) { directory.join("2J.log.1") } + let(:listener1) { observer.listener_for(file1_path) } + let(:listener2) { observer.listener_for(file2.to_path) } + let(:listener3) { observer.listener_for(file3.to_path) } + subject { described_class.new(conf) } + let(:actions) do + RSpec::Sequencing + .run_after(0.25, "create file") do + file_path.open("wb") { |file| 65.times{file.puts(line1)} } + file2.open("wb") { |file| 65.times{file.puts(line1)} } + end + .then_after(0.25, "rename") do + FileUtils.mv(file2.to_path, file3.to_path) + end + .then("wait for expectations to be met") do + wait(1.25).for{listener1.lines.size}.to eq(65) + end + .then("quit") do + tailing.quit + end + end + + it "file 1 content is read correctly, the renamed file 2 is not read at all" do + actions.activate_quietly + tailing.watch_this(watch_dir.to_path) + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener2.lines.size).to eq(0) + expect(listener3.lines.size).to eq(0) + end + end + + context "? rotation: when an active file is renamed inside the glob - issue 214" do + let(:watch_dir) { directory.join("*L.log") } + let(:file_path) { directory.join("1L.log") } + let(:second_file) { directory.join("2L.log") } + subject { described_class.new(conf) } + let(:listener1) { observer.listener_for(file1_path) } + let(:listener2) { observer.listener_for(second_file.to_path) } + let(:stat_interval) { 0.25 } + let(:discover_interval) { 1 } + let(:line4) { "Line 4 - Some other non lorem ipsum content" } + let(:actions) do + RSpec::Sequencing + .run_after(0.75, "create file") do + file_path.open("wb") { |file| file.puts(line1); file.puts(line2) } + end + .then_after(0.5, "rename") do + file_path.rename(second_file) + file_path.open("wb") { |file| file.puts("#{line3}") } + end + .then("wait for expectations to be met") do + wait(2.0).for{listener1.lines.size + listener2.lines.size}.to eq(3) + end + .then_after(0.5, "rename again") do + file_path.rename(second_file) + file_path.open("wb") { |file| file.puts("#{line4}") } + end + .then("wait for expectations to be met") do + wait(2.0).for{listener1.lines.size + listener2.lines.size}.to eq(4) + end + .then("quit") do + tailing.quit + end + end + + it "content is read correctly, the renamed file is not reread from scratch" do + actions.activate_quietly + tailing.watch_this(watch_dir.to_path) + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.lines).to eq([line1, line2, line3, line4]) + expect(listener2.lines).to eq([]) + end + end + end +end diff --git a/spec/filewatch/settings_spec.rb b/spec/filewatch/settings_spec.rb new file mode 100644 index 00000000..fc1f325f --- /dev/null +++ b/spec/filewatch/settings_spec.rb @@ -0,0 +1,14 @@ +require 'logstash/devutils/rspec/spec_helper' +require 'logstash/inputs/friendly_durations' + +describe FileWatch::Settings do + + context "when create from options" do + it "doesn't convert sincedb_clean_after to seconds" do + res = FileWatch::Settings.from_options({:sincedb_clean_after => LogStash::Inputs::FriendlyDurations.call(1, "days").value}) + + expect(res.sincedb_expiry_duration).to eq 1 * 24 * 3600 + end + end + +end diff --git 
a/spec/filewatch/sincedb_record_serializer_spec.rb b/spec/filewatch/sincedb_record_serializer_spec.rb new file mode 100644 index 00000000..007dd1eb --- /dev/null +++ b/spec/filewatch/sincedb_record_serializer_spec.rb @@ -0,0 +1,100 @@ +# encoding: utf-8 +require_relative 'spec_helper' +require 'filewatch/settings' +require 'filewatch/sincedb_record_serializer' + +module FileWatch + describe SincedbRecordSerializer do + let(:opts) { Hash.new } + let(:io) { StringIO.new } + let(:db) { Hash.new } + + let(:sincedb_value_expiry) { SincedbRecordSerializer.days_to_seconds(14) } + + subject { SincedbRecordSerializer.new(sincedb_value_expiry) } + + context "deserialize from IO" do + it 'reads V1 records' do + io.write("5391297 1 4 12\n") + io.rewind + rows = 0 + subject.deserialize(io) do |inode_struct, sincedb_value| + expect(inode_struct.inode).to eq("5391297") + expect(inode_struct.maj).to eq(1) + expect(inode_struct.min).to eq(4) + expect(sincedb_value.position).to eq(12) + rows += 1 + end + expect(rows).to be > 0 + end + + it 'reads V2 records from an IO object' do + now = Time.now.to_f + io.write("5391298 1 4 12 #{now} /a/path/to/1.log\n") + io.rewind + rows = 0 + subject.deserialize(io) do |inode_struct, sincedb_value| + expect(inode_struct.inode).to eq("5391298") + expect(inode_struct.maj).to eq(1) + expect(inode_struct.min).to eq(4) + expect(sincedb_value.position).to eq(12) + expect(sincedb_value.last_changed_at).to eq(now) + expect(sincedb_value.path_in_sincedb).to eq("/a/path/to/1.log") + rows += 1 + end + expect(rows).to be > 0 + end + + it 'properly handles spaces in a filename' do + now = Time.now.to_f + io.write("53912987 1 4 12 #{now} /a/path/to/log log.log\n") + io.rewind + rows = 0 + subject.deserialize(io) do |inode_struct, sincedb_value| + expect(inode_struct.inode).to eq("53912987") + expect(inode_struct.maj).to eq(1) + expect(inode_struct.min).to eq(4) + expect(sincedb_value.position).to eq(12) + expect(sincedb_value.last_changed_at).to eq(now) + expect(sincedb_value.path_in_sincedb).to eq("/a/path/to/log log.log") + rows += 1 + end + expect(rows).to be > 0 + end + end + + context "serialize to IO" do + it "writes db entries" do + now = Time.now.to_f + inode_struct = InodeStruct.new("42424242", 2, 5) + sincedb_value = SincedbValue.new(42, now) + db[inode_struct] = sincedb_value + subject.serialize(db, io) + expect(io.string).to eq("42424242 2 5 42 #{now}\n") + end + + it "does not write expired db entries to an IO object" do + twelve_days_ago = Time.now.to_f - (12.0*24*3600) + sixteen_days_ago = twelve_days_ago - (4.0*24*3600) + db[InodeStruct.new("42424242", 2, 5)] = SincedbValue.new(42, twelve_days_ago) + db[InodeStruct.new("18181818", 1, 6)] = SincedbValue.new(99, sixteen_days_ago) + subject.serialize(db, io) + expect(io.string).to eq("42424242 2 5 42 #{twelve_days_ago}\n") + end + end + + context "given a non default `sincedb_clean_after`" do + + let(:sincedb_value_expiry) { SincedbRecordSerializer.days_to_seconds(2) } + + it "does not write expired db entries to an IO object" do + one_day_ago = Time.now.to_f - (1.0*24*3600) + three_days_ago = one_day_ago - (2.0*24*3600) + db[InodeStruct.new("42424242", 2, 5)] = SincedbValue.new(42, one_day_ago) + db[InodeStruct.new("18181818", 1, 6)] = SincedbValue.new(99, three_days_ago) + subject.serialize(db, io) + expect(io.string).to eq("42424242 2 5 42 #{one_day_ago}\n") + end + end + end +end \ No newline at end of file diff --git a/spec/filewatch/spec_helper.rb b/spec/filewatch/spec_helper.rb new file mode 100644 index 
00000000..f6c6c98a --- /dev/null +++ b/spec/filewatch/spec_helper.rb @@ -0,0 +1,179 @@ +# encoding: utf-8 +require "rspec_sequencing" +require 'rspec/wait' +require "logstash/devutils/rspec/spec_helper" +require "concurrent" +require "timecop" + +def formatted_puts(text) + cfg = RSpec.configuration + return unless cfg.formatters.first.is_a?( + RSpec::Core::Formatters::DocumentationFormatter) + txt = cfg.format_docstrings_block.call(text) + cfg.output_stream.puts " #{txt}" +end + +unless RSpec::Matchers.method_defined?(:receive_call_and_args) + RSpec::Matchers.define(:receive_call_and_args) do |m, args| + match do |actual| + actual.trace_for(m) == args + end + + failure_message do + "Expecting method #{m} to receive: #{args} but got: #{actual.trace_for(m)}" + end + end +end + +require_relative "../helpers/rspec_wait_handler_helper" unless defined? RSPEC_WAIT_HANDLER_PATCHED +require_relative "../helpers/logging_level_helper" unless defined? LOG_AT_HANDLED + +require 'filewatch/bootstrap' + +module FileWatch + class DummyIO + def stat + self + end + def ino + 23456 + end + def size + 65535 + end + def mtime + Time.now + end + def dev_major + 1 + end + def dev_minor + 5 + end + end + + class DummyFileReader + def initialize(read_size, iterations) + @read_size = read_size + @iterations = iterations + @closed = false + @accumulated = 0 + @io = DummyIO.new + end + def file_seek(*) + end + def close() + @closed = true + end + def closed? + @closed + end + def to_io + @io + end + def sysread(amount) + @accumulated += amount + if @accumulated > @read_size * @iterations + raise EOFError.new + end + string = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcde\n" + multiplier = amount / string.length + string * multiplier + end + def sysseek(offset, whence) + end + end + + FIXTURE_DIR = File.join('spec', 'fixtures') + + def self.make_file_older(path, seconds) + time = Time.now.to_f - seconds + ::File.utime(time, time, path) + end + + def self.make_fixture_current(path, time = Time.now) + ::File.utime(time, time, path) + end + + class TracerBase + def initialize + @tracer = Concurrent::Array.new + end + + def trace_for(symbol) + params = @tracer.map {|k,v| k == symbol ? v : nil}.compact + params.empty? ? 
false : params + end + + def clear + @tracer.clear + end + end + + module NullCallable + def self.call + end + end + + class TestObserver + class Listener + attr_reader :path, :lines, :calls + + def initialize(path, lines) + @path = path + @lines = lines || Concurrent::Array.new + @calls = Concurrent::Array.new + end + + def accept(line) + @lines << line + @calls << :accept + end + + def deleted + @calls << :delete + end + + def opened + @calls << :open + end + + def error + @calls << :error + end + + def eof + @calls << :eof + end + + def timed_out + @calls << :timed_out + end + + def reading_completed + @calls << :reading_completed + end + end + + attr_reader :listeners + + def initialize(combined_lines = nil) + @listeners = Concurrent::Hash.new { |hash, key| hash[key] = new_listener(key, combined_lines) } + end + + def listener_for(path) + @listeners[path] + end + + def clear + @listeners.clear + end + + private + + def new_listener(path, lines = nil) + Listener.new(path, lines) + end + + end +end diff --git a/spec/filewatch/tailing_spec.rb b/spec/filewatch/tailing_spec.rb new file mode 100644 index 00000000..fbc4f9e4 --- /dev/null +++ b/spec/filewatch/tailing_spec.rb @@ -0,0 +1,585 @@ +# encoding: utf-8 +require 'stud/temporary' +require_relative 'spec_helper' +require 'filewatch/observing_tail' + +module FileWatch + describe Watch do + let(:directory) { Stud::Temporary.directory } + let(:watch_dir) { ::File.join(directory, "*#{suffix}.log") } + let(:file_path) { ::File.join(directory, "1#{suffix}.log") } + let(:file_path2) { ::File.join(directory, "2#{suffix}.log") } + let(:file_path3) { ::File.join(directory, "3#{suffix}.log") } + let(:max) { 4095 } + let(:stat_interval) { 0.1 } + let(:discover_interval) { 4 } + let(:start_new_files_at) { :end } + let(:sincedb_path) { ::File.join(directory, "tailing.sdb") } + let(:opts) do + { + :stat_interval => stat_interval, + :start_new_files_at => start_new_files_at, + :max_open_files => max, + :delimiter => "\n", + :discover_interval => discover_interval, + :sincedb_path => sincedb_path, + :file_sort_by => "path" + } + end + let(:observer) { TestObserver.new } + let(:listener1) { observer.listener_for(file_path) } + let(:listener2) { observer.listener_for(file_path2) } + let(:listener3) { observer.listener_for(file_path3) } + let(:tailing) { ObservingTail.new(opts) } + + before do + directory + wait(1.0).for { Dir.exist?(directory) }.to eq(true) + end + + after do + FileUtils.rm_rf(directory) + end + + describe "max open files (set to 1)" do + let(:max) { 1 } + let(:wait_before_quit) { 0.15 } + let(:stat_interval) { 0.01 } + let(:discover_interval) { 4 } + let(:start_new_files_at) { :beginning } + let(:actions) do + RSpec::Sequencing + .run_after(wait_before_quit, "quit after a short time") do + tailing.quit + end + end + + before do + ENV["FILEWATCH_MAX_FILES_WARN_INTERVAL"] = "0" + File.open(file_path, "wb") { |file| file.write("line1\nline2\n") } + File.open(file_path2, "wb") { |file| file.write("line-A\nline-B\n") } + end + + context "when max_active is 1" do + let(:suffix) { "A" } + it "without close_older set, opens only 1 file" do + actions.activate_quietly + # create files before first discovery, they will be read from the end + tailing.watch_this(watch_dir) + tailing.subscribe(observer) + actions.assert_no_errors + expect(tailing.settings.max_active).to eq(max) + expect(listener1.lines).to eq(["line1", "line2"]) + expect(listener1.calls).to eq([:open, :accept, :accept]) + expect(listener2.calls).to be_empty + end + end + + context 
"when close_older is set" do + let(:wait_before_quit) { 0.8 } + let(:opts) { super().merge(:close_older => 0.1, :max_open_files => 1, :stat_interval => 0.1) } + let(:suffix) { "B" } + it "opens both files" do + actions.activate_quietly + tailing.watch_this(watch_dir) + tailing.subscribe(observer) + actions.assert_no_errors + expect(tailing.settings.max_active).to eq(1) + expect(listener2.calls).to eq([:open, :accept, :accept, :timed_out]) + expect(listener2.lines).to eq(["line-A", "line-B"]) + expect(listener1.calls).to eq([:open, :accept, :accept, :timed_out]) + expect(listener1.lines).to eq(["line1", "line2"]) + end + end + end + + context "when watching a directory with files, existing content is skipped" do + let(:suffix) { "C" } + let(:actions) do + RSpec::Sequencing + .run("create file") do + File.open(file_path, "wb") { |file| file.write("lineA\nlineB\n") } + end + .then_after(0.1, "begin watching") do + tailing.watch_this(watch_dir) + end + .then_after(1.0, "add content") do + File.open(file_path, "ab") { |file| file.write("line1\nline2\n") } + end + .then("wait") do + wait(0.75).for { listener1.lines }.to_not be_empty + end + .then("quit") do + tailing.quit + end + end + + it "only the new content is read" do + actions.activate_quietly + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.calls).to eq([:open, :accept, :accept]) + expect(listener1.lines).to eq(["line1", "line2"]) + end + end + + context "when watching a directory without files and one is added" do + let(:suffix) { "D" } + let(:actions) do + RSpec::Sequencing + .run("begin watching") do + tailing.watch_this(watch_dir) + end + .then_after(0.1, "create file") do + File.open(file_path, "wb") { |file| file.write("line1\nline2\n") } + end + .then("wait") do + wait(0.75).for { listener1.lines }.to_not be_empty + end + .then("quit") do + tailing.quit + end + end + + it "the file is read from the beginning" do + actions.activate_quietly + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.calls).to eq([:open, :accept, :accept]) + expect(listener1.lines).to eq(["line1", "line2"]) + end + end + + context "given a previously discovered file" do + # these tests rely on the fact that the 'filepath' does not exist on disk + # it simulates that the user deleted the file + # so when a stat is taken on the file an error is raised + let(:suffix) { "E" } + let(:quit_after) { 0.2 } + let(:stat) { double("stat", :size => 100, :modified_at => Time.now.to_f, :inode => 234567, :inode_struct => InodeStruct.new("234567", 1, 5)) } + let(:watched_file) { WatchedFile.new(file_path, stat, tailing.settings) } + before do + allow(stat).to receive(:restat).and_raise(Errno::ENOENT) + tailing.watch.watched_files_collection.add(watched_file) + watched_file.initial_completed + end + + context "when a close operation occurs" do + before { watched_file.close } + it "is removed from the watched_files_collection" do + expect(tailing.watch.watched_files_collection).not_to be_empty + RSpec::Sequencing.run_after(quit_after, "quit") { tailing.quit } + tailing.subscribe(observer) + expect(tailing.watch.watched_files_collection).to be_empty + expect(listener1.calls).to eq([:delete]) + end + end + + context "an ignore operation occurs" do + before { watched_file.ignore } + it "is removed from the watched_files_collection" do + RSpec::Sequencing.run_after(quit_after, "quit") { tailing.quit } + tailing.subscribe(observer) + expect(tailing.watch.watched_files_collection).to be_empty + expect(listener1.calls).to 
eq([:delete]) + end + end + + context "when subscribed and a watched file is no longer readable" do + before { watched_file.watch } + it "is removed from the watched_files_collection" do + RSpec::Sequencing.run_after(quit_after, "quit") { tailing.quit } + tailing.subscribe(observer) + expect(tailing.watch.watched_files_collection).to be_empty + expect(listener1.calls).to eq([:delete]) + end + end + + context "when subscribed and an active file is no longer readable" do + before { watched_file.activate } + it "is removed from the watched_files_collection" do + RSpec::Sequencing.run_after(quit_after, "quit") { tailing.quit } + tailing.subscribe(observer) + expect(tailing.watch.watched_files_collection).to be_empty + expect(listener1.calls).to eq([:delete]) + end + end + end + + context "when a processed file shrinks" do + let(:discover_interval) { 1 } + let(:suffix) { "F" } + let(:actions) do + RSpec::Sequencing + .run_after(0.1, "start watching") do + tailing.watch_this(watch_dir) + end + .then_after(0.1, "create file") do + # create file after first discovery, will be read from the start + File.open(file_path, "wb") { |file| file.write("line1\nline2\nline3\nline4\n") } + end + .then("wait for initial lines to be read") do + wait(0.8).for{listener1.lines.size}.to eq(4), "listener1.lines.size not eq 4" + end + .then_after(0.25, "truncate file and write new content") do + File.truncate(file_path, 0) + File.open(file_path, "ab") { |file| file.write("lineA\nlineB\n") } + wait(0.5).for{listener1.lines.size}.to eq(6), "listener1.lines.size not eq 6" + end + .then("quit") do + tailing.quit + end + end + + it "new changes to the shrunk file are read from the beginning" do + actions.activate_quietly + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.calls).to eq([:open, :accept, :accept, :accept, :accept, :accept, :accept]) + expect(listener1.lines).to eq(["line1", "line2", "line3", "line4", "lineA", "lineB"]) + end + end + + context "when watching a directory with files and a file is renamed to not match glob", :unix => true do + let(:suffix) { "G" } + let(:new_file_path) { file_path + ".old" } + let(:new_file_listener) { observer.listener_for(new_file_path) } + let(:actions) do + RSpec::Sequencing + .run("start watching") do + tailing.watch_this(watch_dir) + end + .then_after(0.1, "create file") do + # create file after first discovery, will be read from the beginning + File.open(file_path, "wb") { |file| file.write("line1\nline2\n") } + end + .then_after(0.55, "rename file") do + FileUtils.mv(file_path, new_file_path) + end + .then_after(0.55, "then write to renamed file") do + File.open(new_file_path, "ab") { |file| file.write("line3\nline4\n") } + wait(0.5).for{listener1.lines.size}.to eq(2), "listener1.lines.size not eq(2)" + end + .then_after(0.1, "quit") do + tailing.quit + end + end + + it "changes to the renamed file are not read" do + actions.activate_quietly + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.calls).to eq([:open, :accept, :accept, :delete]) + expect(listener1.lines).to eq(["line1", "line2"]) + expect(new_file_listener.calls).to eq([]) + expect(new_file_listener.lines).to eq([]) + end + end + + context "when watching a directory with files and a file is renamed to match glob", :unix => true do + let(:suffix) { "H" } + let(:opts) { super().merge(:close_older => 0) } + let(:listener2) { observer.listener_for(file_path2) } + let(:actions) do + RSpec::Sequencing + .run("file created") do + # create file before first discovery, 
will be read from the end + File.open(file_path, "wb") { |file| file.write("line1\nline2\n") } + end + .then_after(0.15, "start watching after files are written") do + tailing.watch_this(watch_dir) + end + .then("wait") do + wait(0.5).for{listener1.calls.last}.to eq(:timed_out) + end + .then("rename file") do + FileUtils.mv(file_path, file_path2) + end + .then_after(0.1, "then write to renamed file") do + File.open(file_path2, "ab") { |file| file.write("line3\nline4\n") } + end + .then_after(0.1, "wait for lines") do + wait(0.5).for{listener2.lines.size}.to eq(2) + end + .then_after(0.1, "quit") do + tailing.quit + end + end + + it "the first set of lines are not re-read" do + actions.activate_quietly + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.lines).to eq([]) + expect(listener1.calls).to eq([:open, :timed_out, :delete]) + expect(listener2.lines).to eq(["line3", "line4"]) + expect(listener2.calls).to eq([:open, :accept, :accept, :timed_out]) + end + end + + context "when watching a directory with files and data is appended" do + let(:suffix) { "I" } + let(:actions) do + RSpec::Sequencing + .run("file created") do + File.open(file_path, "wb") { |file| file.write("line1\nline2\n") } + end + .then_after(0.15, "start watching after file is written") do + tailing.watch_this(watch_dir) + end + .then_after(0.45, "append more lines to the file") do + File.open(file_path, "ab") { |file| file.write("line3\nline4\n") } + wait(0.5).for{listener1.lines.size}.to eq(2) + end + .then_after(0.1, "quit") do + tailing.quit + end + end + + it "appended lines are read only" do + actions.activate_quietly + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.calls).to eq([:open, :accept, :accept]) + expect(listener1.lines).to eq(["line3", "line4"]) + end + end + + context "when close older expiry is enabled" do + let(:opts) { super().merge(:close_older => 1) } + let(:suffix) { "J" } + let(:actions) do + RSpec::Sequencing.run("create file") do + File.open(file_path, "wb") { |file| file.write("line1\nline2\n") } + end + .then("watch and wait") do + tailing.watch_this(watch_dir) + wait(1.25).for{listener1.calls}.to eq([:open, :timed_out]) + end + .then("quit") do + tailing.quit + end + end + + it "existing lines are not read and the file times out" do + actions.activate_quietly + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.lines).to eq([]) + end + end + + context "when close older expiry is enabled and after timeout the file is appended-to" do + let(:opts) { super().merge(:close_older => 0.5) } + let(:suffix) { "K" } + let(:actions) do + RSpec::Sequencing + .run("start watching") do + tailing.watch_this(watch_dir) + end + .then("create file") do + File.open(file_path, "wb") { |file| file.write("line1\nline2\n") } + end + .then("wait for file to be read") do + wait(0.5).for{listener1.calls}.to eq([:open, :accept, :accept]), "file is not read" + end + .then("wait for file to be read and time out") do + wait(0.75).for{listener1.calls}.to eq([:open, :accept, :accept, :timed_out]), "file did not timeout the first time" + end + .then("append more lines to file after file ages more than close_older") do + File.open(file_path, "ab") { |file| file.write("line3\nline4\n") } + end + .then("wait for last timeout") do + wait(0.75).for{listener1.calls}.to eq([:open, :accept, :accept, :timed_out, :open, :accept, :accept, :timed_out]), "file did not timeout the second time" + end + .then("quit") do + tailing.quit + end + end + + it "all lines are 
read" do + actions.activate_quietly + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.lines).to eq(["line1", "line2", "line3", "line4"]) + end + end + + context "when ignore older expiry is enabled and all files are already expired" do + let(:opts) { super().merge(:ignore_older => 1) } + let(:suffix) { "L" } + let(:actions) do + RSpec::Sequencing + .run("create file older than ignore_older and watch") do + File.open(file_path, "wb") { |file| file.write("line1\nline2\n") } + FileWatch.make_file_older(file_path, 15) + tailing.watch_this(watch_dir) + end + .then_after(1.1, "quit") do + tailing.quit + end + end + + it "no files are read" do + actions.activate_quietly + tailing.subscribe(observer) + expect(listener1.calls).to eq([]) + expect(listener1.lines).to eq([]) + end + end + + context "when a file is renamed before it gets activated", :unix => true do + let(:max) { 1 } + let(:opts) { super().merge(:file_chunk_count => 8, :file_chunk_size => 6, :close_older => 0.1, :discover_interval => 6) } + let(:suffix) { "M" } + let(:start_new_files_at) { :beginning } # we are creating files and sincedb record before hand + let(:actions) do + RSpec::Sequencing + .run("create files and sincedb record") do + File.open(file_path, "wb") { |file| 32.times{file.write("line1\n")} } + File.open(file_path2, "wb") { |file| file.write("line2\n") } + # synthesize a sincedb record + stat = File.stat(file_path2) + record = [stat.ino.to_s, stat.dev_major.to_s, stat.dev_minor.to_s, "0", "1526220348.083179", file_path2] + File.open(sincedb_path, "wb") { |file| file.puts(record.join(" ")) } + end + .then_after(0.2, "watch") do + tailing.watch_this(watch_dir) + end + .then_after(0.1, "rename file 2") do + FileUtils.mv(file_path2, file_path3) + end + .then("wait") do + wait(4).for do + listener1.lines.size == 32 && listener2.calls == [:delete] && listener3.calls == [:open, :accept, :timed_out] + end.to eq(true), "listener1.lines != 32 or listener2.calls != [:delete] or listener3.calls != [:open, :accept, :timed_out]" + end + .then("quit") do + tailing.quit + end + end + + it "files are read correctly" do + actions.activate_quietly + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener2.lines).to eq([]) + expect(listener3.lines).to eq(["line2"]) + end + end + + context "when ignore_older is less than close_older and all files are not expired" do + let(:opts) { super().merge(:ignore_older => 1, :close_older => 1.1) } + let(:suffix) { "N" } + let(:start_new_files_at) { :beginning } + let(:actions) do + RSpec::Sequencing + .run_after(0.1, "file created") do + File.open(file_path, "wb") { |file| file.write("line1\nline2\n") } + end + .then("start watching before file age reaches ignore_older") do + tailing.watch_this(watch_dir) + end + .then("wait for lines") do + wait(1.5).for{listener1.calls}.to eq([:open, :accept, :accept, :timed_out]) + end + .then("quit") do + tailing.quit + end + end + + it "reads lines normally" do + actions.activate_quietly + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.lines).to eq(["line1", "line2"]) + end + end + + context "when ignore_older is less than close_older and all files are expired" do + let(:opts) { super().merge(:ignore_older => 10, :close_older => 1) } + let(:suffix) { "P" } + let(:actions) do + RSpec::Sequencing + .run("creating file") do + File.open(file_path, "wb") { |file| file.write("line1\nline2\n") } + end + .then("making it older by 15 seconds and watch") do + FileWatch.make_file_older(file_path, 15) + 
tailing.watch_this(watch_dir) + end + .then_after(0.75, "quit after allowing time to check the files") do + tailing.quit + end + end + + it "no files are read" do + actions.activate_quietly + tailing.subscribe(observer) + expect(listener1.calls).to eq([]) + expect(listener1.lines).to eq([]) + end + end + + context "when ignore older and close older expiry is enabled and after timeout the file is appended-to" do + let(:opts) { super().merge(:ignore_older => 20, :close_older => 0.5) } + let(:suffix) { "Q" } + let(:actions) do + RSpec::Sequencing + .run("file older than ignore_older created and watching") do + File.open(file_path, "wb") { |file| file.write("line1\nline2\n") } + FileWatch.make_file_older(file_path, 25) + tailing.watch_this(watch_dir) + end + .then_after(0.15, "append more lines to file after file ages more than ignore_older") do + File.open(file_path, "ab") { |file| file.write("line3\nline4\n") } + end + .then("wait for lines") do + wait(2).for{listener1.calls}.to eq([:open, :accept, :accept, :timed_out]) + end + .then_after(0.1, "quit after allowing time to close the file") do + tailing.quit + end + end + + it "reads the added lines only" do + actions.activate_quietly + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.lines).to eq(["line3", "line4"]) + end + end + + context "when a non default delimiter is specified and it is not in the content" do + let(:opts) { super().merge(:ignore_older => 20, :close_older => 1, :delimiter => "\nø") } + let(:suffix) { "R" } + let(:actions) do + RSpec::Sequencing + .run("start watching") do + tailing.watch_this(watch_dir) + end + .then("creating file") do + File.open(file_path, "wb") { |file| file.write("line1\nline2") } + end + .then("wait for :timeout") do + wait(2).for{listener1.calls}.to eq([:open, :timed_out]) + end + .then_after(0.75, "quit after allowing time to close the file") do + tailing.quit + end + end + + it "the file is opened, data is read, but no lines are found, the file times out" do + actions.activate_quietly + tailing.subscribe(observer) + actions.assert_no_errors + expect(listener1.lines).to eq([]) + sincedb_record_fields = File.read(sincedb_path).split(" ") + position_field_index = 3 + # tailing, no delimiter, we are expecting one, if it grows we read from the start. + # there is an info log telling us that no lines were seen but we can't test for it. 
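+        # A sincedb record is a single space-separated line; roughly (field names are
+        # descriptive here, not the plugin's internal terms):
+        #   inode dev_major dev_minor position timestamp path
+        # e.g. "234567 3 2 0 1526220348.083179 /path/to/R.txt"
+        # so index 3 is the byte position, which should still be 0 for this file.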
+ expect(sincedb_record_fields[position_field_index]).to eq("0") + end + end + end +end diff --git a/spec/filewatch/watched_file_spec.rb b/spec/filewatch/watched_file_spec.rb new file mode 100644 index 00000000..a639491c --- /dev/null +++ b/spec/filewatch/watched_file_spec.rb @@ -0,0 +1,69 @@ +# encoding: utf-8 +require 'stud/temporary' +require_relative 'spec_helper' + +module FileWatch + describe WatchedFile do + let(:pathname) { Pathname.new(__FILE__) } + + context 'Given two instances of the same file' do + it 'their sincedb_keys should equate' do + wf_key1 = WatchedFile.new(pathname, PathStatClass.new(pathname), Settings.new).sincedb_key + hash_db = { wf_key1 => 42 } + wf_key2 = WatchedFile.new(pathname, PathStatClass.new(pathname), Settings.new).sincedb_key + expect(wf_key1).to eq(wf_key2) + expect(wf_key1).to eql(wf_key2) + expect(wf_key1.hash).to eq(wf_key2.hash) + expect(hash_db[wf_key2]).to eq(42) + end + end + + context 'Given a barrage of state changes' do + it 'only the previous N state changes are remembered' do + watched_file = WatchedFile.new(pathname, PathStatClass.new(pathname), Settings.new) + watched_file.ignore + watched_file.watch + watched_file.activate + watched_file.watch + watched_file.close + watched_file.watch + watched_file.activate + watched_file.unwatch + watched_file.activate + watched_file.close + expect(watched_file.closed?).to be_truthy + expect(watched_file.recent_states).to eq([:watched, :active, :watched, :closed, :watched, :active, :unwatched, :active]) + end + end + + context 'restat' do + + let(:directory) { Stud::Temporary.directory } + let(:file_path) { ::File.join(directory, "restat.file.txt") } + let(:pathname) { Pathname.new(file_path) } + + before { FileUtils.touch file_path, :mtime => Time.now - 300 } + + it 'reports false value when no changes' do + file = WatchedFile.new(pathname, PathStatClass.new(pathname), Settings.new) + mtime = file.modified_at + expect( file.modified_at_changed? ).to be false + expect( file.restat! ).to be_falsy + expect( file.modified_at_changed? ).to be false + expect( file.modified_at ).to eql mtime + expect( file.modified_at(true) ).to eql mtime + end + + it 'reports truthy when changes detected' do + file = WatchedFile.new(pathname, PathStatClass.new(pathname), Settings.new) + mtime = file.modified_at + expect( file.modified_at_changed? ).to be false + FileUtils.touch file_path + expect( file.restat! ).to be_truthy + expect( file.modified_at_changed? 
).to be true + expect( file.modified_at ).to eql mtime # until updated + expect( file.modified_at(true) ).to be > mtime + end + end + end +end diff --git a/spec/filewatch/watched_files_collection_spec.rb b/spec/filewatch/watched_files_collection_spec.rb new file mode 100644 index 00000000..d1b778e2 --- /dev/null +++ b/spec/filewatch/watched_files_collection_spec.rb @@ -0,0 +1,149 @@ +# encoding: utf-8 +require_relative 'spec_helper' + +module FileWatch + describe WatchedFilesCollection do + let(:time) { Time.now } + let(:filepath1) { "/var/log/z.log" } + let(:filepath2) { "/var/log/m.log" } + let(:filepath3) { "/var/log/a.log" } + let(:filepath4) { "/var/log/b.log" } + let(:stat1) { double("stat1", :size => 98, :modified_at => time - 30, :inode => 234567, :inode_struct => InodeStruct.new("234567", 3, 2)) } + let(:stat2) { double("stat2", :size => 99, :modified_at => time - 20, :inode => 234568, :inode_struct => InodeStruct.new("234568", 3, 2)) } + let(:stat3) { double("stat3", :size => 100, :modified_at => time, :inode => 234569, :inode_struct => InodeStruct.new("234569", 3, 2)) } + let(:stat4) { double("stat4", :size => 99, :modified_at => time, :inode => 234570, :inode_struct => InodeStruct.new("234570", 3, 2)) } + let(:wf1) { WatchedFile.new(filepath1, stat1, Settings.new) } + let(:wf2) { WatchedFile.new(filepath2, stat2, Settings.new) } + let(:wf3) { WatchedFile.new(filepath3, stat3, Settings.new) } + let(:wf4) { WatchedFile.new(filepath4, stat4, Settings.new) } + + context "sort by last_modified in ascending order" do + let(:sort_by) { "last_modified" } + let(:sort_direction) { "asc" } + + it "sorts earliest modified first" do + collection = described_class.new(Settings.from_options(:file_sort_by => sort_by, :file_sort_direction => sort_direction)) + expect(collection.empty?).to be true + collection.add(wf2) + expect(collection.empty?).to be false + expect(collection.values).to eq([wf2]) + collection.add(wf3) + expect(collection.values).to eq([wf2, wf3]) + collection.add(wf1) + expect(collection.values).to eq([wf1, wf2, wf3]) + expect(collection.keys.size).to eq 3 + end + + it "sorts by path when mtime is same" do + collection = described_class.new(Settings.from_options(:file_sort_by => sort_by, :file_sort_direction => sort_direction)) + expect(collection.size).to eq 0 + collection.add(wf2) + collection.add(wf4) + collection.add(wf1) + expect(collection.size).to eq 3 + expect(collection.values).to eq([wf1, wf2, wf4]) + collection.add(wf3) + expect(collection.size).to eq 4 + expect(collection.values).to eq([wf1, wf2, wf3, wf4]) + expect(collection.keys.size).to eq 4 + end + end + + context "sort by path in ascending order" do + let(:sort_by) { "path" } + let(:sort_direction) { "asc" } + + it "sorts path A-Z" do + collection = described_class.new(Settings.from_options(:file_sort_by => sort_by, :file_sort_direction => sort_direction)) + collection.add(wf2) + expect(collection.values).to eq([wf2]) + collection.add(wf1) + expect(collection.values).to eq([wf2, wf1]) + collection.add(wf3) + expect(collection.values).to eq([wf3, wf2, wf1]) + end + end + + context "sort by last_modified in descending order" do + let(:sort_by) { "last_modified" } + let(:sort_direction) { "desc" } + + it "sorts latest modified first" do + collection = described_class.new(Settings.from_options(:file_sort_by => sort_by, :file_sort_direction => sort_direction)) + collection.add(wf2) + expect(collection.values).to eq([wf2]) + collection.add(wf1) + expect(collection.values).to eq([wf2, wf1]) + collection.add(wf3) + 
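+        # wf3 carries the newest mtime (time), wf2 is 20s older and wf1 30s older, so a
+        # descending last_modified sort should surface the newest file first.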
expect(collection.values).to eq([wf3, wf2, wf1]) + end + end + + context "sort by path in descending order" do + let(:sort_by) { "path" } + let(:sort_direction) { "desc" } + + it "sorts path Z-A" do + collection = described_class.new(Settings.from_options(:file_sort_by => sort_by, :file_sort_direction => sort_direction)) + collection.add(wf2) + expect(collection.values).to eq([wf2]) + collection.add(wf1) + expect(collection.values).to eq([wf1, wf2]) + collection.add(wf3) + expect(collection.values).to eq([wf1, wf2, wf3]) + end + end + + context "remove_paths" do + let(:sort_by) { "path" } + let(:sort_direction) { "desc" } + + it "is able to delete multiple files at once" do + collection = described_class.new(Settings.from_options(:file_sort_by => sort_by, :file_sort_direction => sort_direction)) + collection.add(wf1) + collection.add(wf2) + collection.add(wf3) + expect(collection.keys).to eq([filepath1, filepath2, filepath3]) + + ret = collection.remove_paths([filepath2, filepath3]) + expect(ret).to eq 2 + expect(collection.keys).to eq([filepath1]) + expect(collection.values.size).to eq 1 + + ret = collection.remove_paths([filepath2]) + expect(ret).to eq 0 + end + end + + context "update" do + let(:sort_by) { "last_modified" } + let(:sort_direction) { "asc" } + + let(:re_stat1) { double("restat1", :size => 99, :modified_at => time, :inode => 234567, :inode_struct => InodeStruct.new("234567", 3, 2)) } + let(:re_stat2) { double("restat2", :size => 99, :modified_at => time, :inode => 234568, :inode_struct => InodeStruct.new("234568", 3, 2)) } + + it "updates entry with changed mtime" do + collection = described_class.new(Settings.from_options(:file_sort_by => sort_by, :file_sort_direction => sort_direction)) + collection.add(wf1) + collection.add(wf2) + collection.add(wf3) + expect(collection.files).to eq([wf1, wf2, wf3]) + + wf2.send(:set_stat, re_stat2) + expect( wf2.modified_at_changed? ).to be_truthy + + collection.update wf2 + expect(collection.files).to eq([wf1, wf3, wf2]) + + wf1.send(:set_stat, re_stat1) + expect( wf1.modified_at_changed? ).to be_truthy + collection.update wf1 + expect(collection.files).to eq([wf3, wf2, wf1]) + + collection.add(wf4) + expect(collection.files).to eq([wf3, wf4, wf2, wf1]) + end + end + + end +end diff --git a/spec/filewatch/winhelper_spec.rb b/spec/filewatch/winhelper_spec.rb new file mode 100644 index 00000000..a0d874c6 --- /dev/null +++ b/spec/filewatch/winhelper_spec.rb @@ -0,0 +1,22 @@ +# encoding: utf-8 +require "stud/temporary" +require "fileutils" + +if Gem.win_platform? 
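+  # Winhelper relies on Windows-specific FFI calls, so this spec only runs on Windows.
+  # The identifier appears to be composed of the volume serial number and the file
+  # index (high and low parts) joined with dashes, which is why the expectation below
+  # counts exactly two "-" characters.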
+ require "filewatch/winhelper" + + describe Winhelper do + let(:path) { Stud::Temporary.file.path } + + after do + FileUtils.rm_rf(path) + end + + it "return a unique file identifier" do + identifier = Winhelper.identifier_from_path(path) + + expect(identifier).not_to eq("unknown") + expect(identifier.count("-")).to eq(2) + end + end +end diff --git a/spec/fixtures/compressed.log.gz b/spec/fixtures/compressed.log.gz new file mode 100644 index 00000000..e8e0cb9e Binary files /dev/null and b/spec/fixtures/compressed.log.gz differ diff --git a/spec/fixtures/compressed.log.gzip b/spec/fixtures/compressed.log.gzip new file mode 100644 index 00000000..e8e0cb9e Binary files /dev/null and b/spec/fixtures/compressed.log.gzip differ diff --git a/spec/fixtures/invalid_utf8.gbk.log b/spec/fixtures/invalid_utf8.gbk.log new file mode 100644 index 00000000..372387bb --- /dev/null +++ b/spec/fixtures/invalid_utf8.gbk.log @@ -0,0 +1,2 @@ +2015-01-01T02:52:45.866722Z no "GET http://www.logstash.com:80/utfmadness/¡Ö4od HTTP/1.1" + diff --git a/spec/fixtures/no-final-newline.log b/spec/fixtures/no-final-newline.log new file mode 100644 index 00000000..07909300 --- /dev/null +++ b/spec/fixtures/no-final-newline.log @@ -0,0 +1,2 @@ +2010-03-12 23:51:20 SEA4 192.0.2.147 connect 2014 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st​ key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 - - - - +2010-03-12 23:51:21 SEA4 192.0.2.222 play 3914 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st​ key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 myvideo p=2&q=4 flv 1 \ No newline at end of file diff --git a/spec/fixtures/uncompressed.log b/spec/fixtures/uncompressed.log new file mode 100644 index 00000000..d827e02a --- /dev/null +++ b/spec/fixtures/uncompressed.log @@ -0,0 +1,2 @@ +2010-03-12 23:51:20 SEA4 192.0.2.147 connect 2014 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st​ key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 - - - - +2010-03-12 23:51:21 SEA4 192.0.2.222 play 3914 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st​ key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 myvideo p=2&q=4 flv 1 diff --git a/spec/helpers/logging_level_helper.rb b/spec/helpers/logging_level_helper.rb new file mode 100644 index 00000000..0ecc433c --- /dev/null +++ b/spec/helpers/logging_level_helper.rb @@ -0,0 +1,8 @@ +# encoding: utf-8 + +ENV["LOG_AT"].tap do |level| + if !level.nil? + LogStash::Logging::Logger::configure_logging(level) + LOG_AT_HANDLED = true + end +end diff --git a/spec/helpers/rspec_wait_handler_helper.rb b/spec/helpers/rspec_wait_handler_helper.rb new file mode 100644 index 00000000..2ad595b1 --- /dev/null +++ b/spec/helpers/rspec_wait_handler_helper.rb @@ -0,0 +1,38 @@ +# encoding: utf-8 + +module RSpec + module Wait + module Handler + def handle_matcher(target, *args, &block) + # there is a similar patch in the rspec-wait repo since Nov, 19 2017 + # it does not look like the author is interested in the change. 
+ # - do not use Ruby Timeout + count = RSpec.configuration.wait_timeout.fdiv(RSpec.configuration.wait_delay).ceil + failure = nil + count.times do + begin + actual = target.respond_to?(:call) ? target.call : target + super(actual, *args, &block) + failure = nil + rescue RSpec::Expectations::ExpectationNotMetError => failure + sleep RSpec.configuration.wait_delay + end + break if failure.nil? + end + raise failure unless failure.nil? + end + end + + # From: https://github.com/rspec/rspec-expectations/blob/v3.0.0/lib/rspec/expectations/handler.rb#L44-L63 + class PositiveHandler < RSpec::Expectations::PositiveExpectationHandler + extend Handler + end + + # From: https://github.com/rspec/rspec-expectations/blob/v3.0.0/lib/rspec/expectations/handler.rb#L66-L93 + class NegativeHandler < RSpec::Expectations::NegativeExpectationHandler + extend Handler + end + end +end + +RSPEC_WAIT_HANDLER_PATCHED = true diff --git a/spec/spec_helper.rb b/spec/helpers/spec_helper.rb similarity index 51% rename from spec/spec_helper.rb rename to spec/helpers/spec_helper.rb index 850c275c..095b3275 100644 --- a/spec/spec_helper.rb +++ b/spec/helpers/spec_helper.rb @@ -1,44 +1,56 @@ # encoding: utf-8 require "logstash/devutils/rspec/spec_helper" +require "rspec/wait" require "rspec_sequencing" module FileInput + + FIXTURE_DIR = File.join('spec', 'fixtures') + def self.make_file_older(path, seconds) time = Time.now.to_f - seconds - File.utime(time, time, path) + ::File.utime(time, time, path) + end + + def self.make_fixture_current(path, time = Time.now) + ::File.utime(time, time, path) + end + + def self.corrupt_gzip(file_path) + f = File.open(file_path, "w") + f.seek(12) + f.puts 'corrupting_string' + f.close() end - + + def self.truncate_gzip(file_path) + f = File.open(file_path, "ab") + f.truncate(100) + f.close() + end + class TracerBase - def initialize() @tracer = []; end + def initialize + @tracer = Concurrent::Array.new + end def trace_for(symbol) params = @tracer.map {|k,v| k == symbol ? v : nil}.compact - params.empty? ? false : params + if params.empty? + false + else + # merge all params with same key + # there could be multiple instances of same call, e.g. [[:accept, true], [:auto_flush, true], [:close, true], [:auto_flush, true]] + params.reduce {|b1, b2| b1 and b2} + end end - def clear() - @tracer.clear() + def clear + @tracer.clear end end - class FileLogTracer < TracerBase - def warn(*args) @tracer.push [:warn, args]; end - def error(*args) @tracer.push [:error, args]; end - def debug(*args) @tracer.push [:debug, args]; end - def info(*args) @tracer.push [:info, args]; end - - def info?() true; end - def debug?() true; end - def warn?() true; end - def error?() true; end - end - - class ComponentTracer < TracerBase - def accept(*args) @tracer.push [:accept, args]; end - def deliver(*args) @tracer.push [:deliver, args]; end - end - class CodecTracer < TracerBase def decode_accept(ctx, data, listener) @tracer.push [:decode_accept, [ctx, data]] @@ -57,29 +69,13 @@ def close @tracer.push [:close, true] end def clone - self.class.new + self end end end -unless Kernel.method_defined?(:pause_until) - module Kernel - def pause_until(nap = 5, &block) - sq = SizedQueue.new(1) - th1 = Thread.new(sq) {|q| sleep nap; q.push(false) } - th2 = Thread.new(sq) do |q| - success = false - iters = nap * 5 + 1 - iters.times do - break if !!(success = block.call) - sleep(0.2) - end - q.push(success) - end - sq.pop - end - end -end +require_relative "rspec_wait_handler_helper" unless defined? 
RSPEC_WAIT_HANDLER_PATCHED +require_relative "logging_level_helper" unless defined? LOG_AT_HANDLED unless RSpec::Matchers.method_defined?(:receive_call_and_args) RSpec::Matchers.define(:receive_call_and_args) do |m, args| @@ -93,3 +89,6 @@ def pause_until(nap = 5, &block) end end +ENV["LOG_AT"].tap do |level| + LogStash::Logging::Logger::configure_logging(level) unless level.nil? +end diff --git a/spec/inputs/file_read_spec.rb b/spec/inputs/file_read_spec.rb new file mode 100644 index 00000000..ef820b96 --- /dev/null +++ b/spec/inputs/file_read_spec.rb @@ -0,0 +1,395 @@ +# encoding: utf-8 + +require "helpers/spec_helper" +require "logstash/inputs/file" + +# LogStash::Logging::Logger::configure_logging("DEBUG") + +require "tempfile" +require "stud/temporary" +require "logstash/codecs/multiline" + +describe LogStash::Inputs::File do + describe "'read' mode testing with input(conf) do |pipeline, queue|" do + it "should start at the beginning of an existing file and delete the file when done" do + directory = Stud::Temporary.directory + tmpfile_path = ::File.join(directory, "A.log") + sincedb_path = ::File.join(directory, "readmode_A_sincedb.txt") + path_path = ::File.join(directory, "*.log") + + conf = <<-CONFIG + input { + file { + id => "blah" + path => "#{path_path}" + sincedb_path => "#{sincedb_path}" + delimiter => "|" + mode => "read" + file_completed_action => "delete" + } + } + CONFIG + + File.open(tmpfile_path, "a") do |fd| + fd.write("hello|world") + fd.fsync + end + + events = input(conf) do |pipeline, queue| + wait(0.5).for{File.exist?(tmpfile_path)}.to be_falsey + 2.times.collect { queue.pop } + end + + expect(events.map{|e| e.get("message")}).to contain_exactly("hello", "world") + end + + it "should start at the beginning of an existing file and log the file when done" do + directory = Stud::Temporary.directory + tmpfile_path = ::File.join(directory, "A.log") + sincedb_path = ::File.join(directory, "readmode_A_sincedb.txt") + path_path = ::File.join(directory, "*.log") + log_completed_path = ::File.join(directory, "A_completed.txt") + + conf = <<-CONFIG + input { + file { + id => "blah" + path => "#{path_path}" + sincedb_path => "#{sincedb_path}" + delimiter => "|" + mode => "read" + file_completed_action => "log" + file_completed_log_path => "#{log_completed_path}" + } + } + CONFIG + + File.open(tmpfile_path, "a") do |fd| + fd.write("hello|world") + fd.fsync + end + + events = input(conf) do |pipeline, queue| + wait(0.75).for { IO.read(log_completed_path) }.to match(/A\.log/) + 2.times.collect { queue.pop } + end + expect(events.map{|e| e.get("message")}).to contain_exactly("hello", "world") + end + + it "should read whole file when exit_after_read is set to true" do + directory = Stud::Temporary.directory + tmpfile_path = ::File.join(directory, "B.log") + sincedb_path = ::File.join(directory, "readmode_B_sincedb.txt") + path_path = ::File.join(directory, "*.log") + + conf = <<-CONFIG + input { + file { + id => "foo" + path => "#{path_path}" + sincedb_path => "#{sincedb_path}" + delimiter => "|" + mode => "read" + file_completed_action => "delete" + exit_after_read => true + } + } + CONFIG + + File.open(tmpfile_path, "a") do |fd| + fd.write("exit|after|end") + fd.fsync + end + + events = input(conf) do |pipeline, queue| + wait(0.5).for{File.exist?(tmpfile_path)}.to be_falsey + 3.times.collect { queue.pop } + end + + expect(events.map{|e| e.get("message")}).to contain_exactly("exit", "after", "end") + end + + end + + describe "reading fixtures" do + let(:fixture_dir) { 
Pathname.new(FileInput::FIXTURE_DIR).expand_path } + + context "for a file without a final newline character" do + let(:file_path) { fixture_dir.join('no-final-newline.log') } + + it "the file is read and the path is logged to the `file_completed_log_path` file" do + tmpfile_path = fixture_dir.join("no-f*.log") + sincedb_path = Stud::Temporary.pathname + FileInput.make_fixture_current(file_path.to_path) + log_completed_path = Stud::Temporary.pathname + + conf = <<-CONFIG + input { + file { + type => "blah" + path => "#{tmpfile_path}" + sincedb_path => "#{sincedb_path}" + mode => "read" + file_completed_action => "log" + file_completed_log_path => "#{log_completed_path}" + } + } + CONFIG + + events = input(conf) do |pipeline, queue| + wait(0.75).for { IO.read(log_completed_path) }.to match(/#{file_path.to_s}/) + 2.times.collect { queue.pop } + end + + expect(events[0].get("message")).to start_with("2010-03-12 23:51") + expect(events[1].get("message")).to start_with("2010-03-12 23:51") + end + + end + + context "for an uncompressed file" do + let(:file_path) { fixture_dir.join('uncompressed.log') } + + it "the file is read and the path is logged to the `file_completed_log_path` file" do + FileInput.make_fixture_current(file_path.to_path) + tmpfile_path = fixture_dir.join("unc*.log") + directory = Stud::Temporary.directory + sincedb_path = ::File.join(directory, "readmode_B_sincedb.txt") + log_completed_path = ::File.join(directory, "B_completed.txt") + + conf = <<-CONFIG + input { + file { + type => "blah" + path => "#{tmpfile_path}" + sincedb_path => "#{sincedb_path}" + mode => "read" + file_completed_action => "log" + file_completed_log_path => "#{log_completed_path}" + } + } + CONFIG + + events = input(conf) do |pipeline, queue| + wait(0.75).for{ IO.read(log_completed_path) }.to match(/uncompressed\.log/) + 2.times.collect { queue.pop } + end + + expect(events[0].get("message")).to start_with("2010-03-12 23:51") + expect(events[1].get("message")).to start_with("2010-03-12 23:51") + end + end + + context "for a compressed file" do + let(:tmp_directory) { Stud::Temporary.directory } + let(:all_files_path) { fixture_dir.join("compressed.*.*") } + let(:gz_file_path) { fixture_dir.join('compressed.log.gz') } + let(:gzip_file_path) { fixture_dir.join('compressed.log.gzip') } + let(:sincedb_path) { ::File.join(tmp_directory, "sincedb.db") } + let(:log_completed_path) { ::File.join(tmp_directory, "completed.log") } + + it "the file is read" do + FileInput.make_fixture_current(gz_file_path.to_path) + FileInput.make_fixture_current(gzip_file_path.to_path) + + conf = <<-CONFIG + input { + file { + type => "blah" + path => "#{all_files_path}" + sincedb_path => "#{sincedb_path}" + mode => "read" + file_completed_action => "log" + file_completed_log_path => "#{log_completed_path}" + exit_after_read => true + } + } + CONFIG + + events = input(conf) do |pipeline, queue| + wait(0.75).for { IO.read(log_completed_path).scan(/compressed\.log\.gz(ip)?/).size }.to eq(2) + 4.times.collect { queue.pop } + end + + expect(events[0].get("message")).to start_with("2010-03-12 23:51") + expect(events[1].get("message")).to start_with("2010-03-12 23:51") + expect(events[2].get("message")).to start_with("2010-03-12 23:51") + expect(events[3].get("message")).to start_with("2010-03-12 23:51") + end + + it "the corrupted file is untouched" do + corrupted_file_path = ::File.join(tmp_directory, 'corrupted.gz') + FileUtils.cp(gz_file_path, corrupted_file_path) + + FileInput.corrupt_gzip(corrupted_file_path) + + conf = 
<<-CONFIG + input { + file { + type => "blah" + path => "#{corrupted_file_path}" + mode => "read" + file_completed_action => "log_and_delete" + file_completed_log_path => "#{log_completed_path}" + check_archive_validity => true + exit_after_read => true + } + } + CONFIG + + input(conf) do |pipeline, queue| + wait(1) + expect(IO.read(log_completed_path)).to be_empty + end + end + + it "the truncated file is untouched" do + truncated_file_path = ::File.join(tmp_directory, 'truncated.gz') + FileUtils.cp(gz_file_path, truncated_file_path) + + FileInput.truncate_gzip(truncated_file_path) + + conf = <<-CONFIG + input { + file { + type => "blah" + path => "#{truncated_file_path}" + mode => "read" + file_completed_action => "log_and_delete" + file_completed_log_path => "#{log_completed_path}" + check_archive_validity => true + exit_after_read => true + } + } + CONFIG + + input(conf) do |pipeline, queue| + wait(1) + expect(IO.read(log_completed_path)).to be_empty + end + end + end + end + + let(:temp_directory) { Stud::Temporary.directory } + let(:interval) { 0.1 } + let(:options) do + { + 'mode' => "read", + 'path' => "#{temp_directory}/*", + 'stat_interval' => interval, + 'discover_interval' => interval, + 'sincedb_path' => "#{temp_directory}/.sincedb", + 'sincedb_write_interval' => interval + } + end + + let(:queue) { Queue.new } + let(:plugin) { LogStash::Inputs::File.new(options) } + + describe 'delete on complete' do + + let(:options) do + super().merge({ 'file_completed_action' => "delete", 'exit_after_read' => false }) + end + + let(:sample_file) { File.join(temp_directory, "sample.log") } + + before do + plugin.register + @run_thread = Thread.new(plugin) do |plugin| + Thread.current.abort_on_exception = true + plugin.run queue + end + + File.open(sample_file, 'w') { |fd| fd.write("sample-content\n") } + + wait_for_start_processing(@run_thread) + end + + after { plugin.stop } + + it 'processes a file' do + wait_for_file_removal(sample_file) # watched discovery + + expect( plugin.queue.size ).to eql 1 + event = plugin.queue.pop + expect( event.get('message') ).to eql 'sample-content' + end + + it 'removes watched file from collection' do + wait_for_file_removal(sample_file) # watched discovery + sleep(0.25) # give CI some space to execute the removal + # TODO shouldn't be necessary once WatchedFileCollection does proper locking + watched_files = plugin.watcher.watch.watched_files_collection + expect( watched_files ).to be_empty + end + end + + describe 'sincedb cleanup' do + + let(:options) do + super().merge( + 'sincedb_path' => sincedb_path, + 'sincedb_clean_after' => '1.0 seconds', + 'sincedb_write_interval' => 0.25, + 'stat_interval' => 0.1, + ) + end + + let(:sincedb_path) { "#{temp_directory}/.sincedb" } + + let(:sample_file) { File.join(temp_directory, "sample.txt") } + + before do + plugin.register + @run_thread = Thread.new(plugin) do |plugin| + Thread.current.abort_on_exception = true + plugin.run queue + end + + File.open(sample_file, 'w') { |fd| fd.write("line1\nline2\n") } + + wait_for_start_processing(@run_thread) + end + + after { plugin.stop } + + it 'cleans up sincedb entry' do + wait_for_file_removal(sample_file) # watched discovery + + sincedb_content = File.read(sincedb_path).strip + expect( sincedb_content ).to_not be_empty + + try(3) do + sleep(1.5) # > sincedb_clean_after + + sincedb_content = File.read(sincedb_path).strip + expect( sincedb_content ).to be_empty + end + end + + end + + private + + def wait_for_start_processing(run_thread, timeout: 1.0) + begin + 
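+        # Wait for the plugin's run thread to park itself (status 'sleep') and for its
+        # queue to be assigned; if that does not happen within `timeout`, the rescue
+        # below reports that the plugin never started processing.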
Timeout.timeout(timeout) do + sleep(0.01) while run_thread.status != 'sleep' + sleep(timeout) unless plugin.queue + end + rescue Timeout::Error + raise "plugin did not start processing (timeout: #{timeout})" unless plugin.queue + else + raise "plugin did not start processing" unless plugin.queue + end + end + + def wait_for_file_removal(path) + timeout = interval + try(5) do + wait(timeout).for { File.exist?(path) }.to be_falsey + end + end +end diff --git a/spec/inputs/file_spec.rb b/spec/inputs/file_spec.rb deleted file mode 100644 index fd196097..00000000 --- a/spec/inputs/file_spec.rb +++ /dev/null @@ -1,502 +0,0 @@ -# encoding: utf-8 - -require "logstash/inputs/file" -require_relative "../spec_helper" -require "tempfile" -require "stud/temporary" -require "logstash/codecs/multiline" - -FILE_DELIMITER = LogStash::Environment.windows? ? "\r\n" : "\n" - -describe LogStash::Inputs::File do - describe "testing with input(conf) do |pipeline, queue|" do - it_behaves_like "an interruptible input plugin" do - let(:config) do - { - "path" => Stud::Temporary.pathname, - "sincedb_path" => Stud::Temporary.pathname - } - end - end - - it "should start at the beginning of an existing file" do - tmpfile_path = Stud::Temporary.pathname - sincedb_path = Stud::Temporary.pathname - - conf = <<-CONFIG - input { - file { - type => "blah" - path => "#{tmpfile_path}" - start_position => "beginning" - sincedb_path => "#{sincedb_path}" - delimiter => "#{FILE_DELIMITER}" - } - } - CONFIG - - File.open(tmpfile_path, "a") do |fd| - fd.puts("hello") - fd.puts("world") - fd.fsync - end - - events = input(conf) do |pipeline, queue| - 2.times.collect { queue.pop } - end - - insist { events[0].get("message") } == "hello" - insist { events[1].get("message") } == "world" - end - - it "should restarts at the sincedb value" do - tmpfile_path = Stud::Temporary.pathname - sincedb_path = Stud::Temporary.pathname - - conf = <<-CONFIG - input { - file { - type => "blah" - path => "#{tmpfile_path}" - start_position => "beginning" - sincedb_path => "#{sincedb_path}" - delimiter => "#{FILE_DELIMITER}" - } - } - CONFIG - - File.open(tmpfile_path, "w") do |fd| - fd.puts("hello3") - fd.puts("world3") - end - - events = input(conf) do |pipeline, queue| - 2.times.collect { queue.pop } - end - - insist { events[0].get("message") } == "hello3" - insist { events[1].get("message") } == "world3" - - File.open(tmpfile_path, "a") do |fd| - fd.puts("foo") - fd.puts("bar") - fd.puts("baz") - fd.fsync - end - - events = input(conf) do |pipeline, queue| - 3.times.collect { queue.pop } - end - - insist { events[0].get("message") } == "foo" - insist { events[1].get("message") } == "bar" - insist { events[2].get("message") } == "baz" - end - - it "should not overwrite existing path and host fields" do - tmpfile_path = Stud::Temporary.pathname - sincedb_path = Stud::Temporary.pathname - - conf = <<-CONFIG - input { - file { - type => "blah" - path => "#{tmpfile_path}" - start_position => "beginning" - sincedb_path => "#{sincedb_path}" - delimiter => "#{FILE_DELIMITER}" - codec => "json" - } - } - CONFIG - - File.open(tmpfile_path, "w") do |fd| - fd.puts('{"path": "my_path", "host": "my_host"}') - fd.puts('{"my_field": "my_val"}') - fd.fsync - end - - events = input(conf) do |pipeline, queue| - 2.times.collect { queue.pop } - end - - insist { events[0].get("path") } == "my_path" - insist { events[0].get("host") } == "my_host" - - insist { events[1].get("path") } == "#{tmpfile_path}" - insist { events[1].get("host") } == 
"#{Socket.gethostname.force_encoding(Encoding::UTF_8)}" - end - - context "when sincedb_path is an existing directory" do - let(:tmpfile_path) { Stud::Temporary.pathname } - let(:sincedb_path) { Stud::Temporary.directory } - subject { LogStash::Inputs::File.new("path" => tmpfile_path, "sincedb_path" => sincedb_path) } - - after :each do - FileUtils.rm_rf(sincedb_path) - end - - it "should raise exception" do - expect { subject.register }.to raise_error(ArgumentError) - end - end - end - - describe "testing with new, register, run and stop" do - let(:conf) { Hash.new } - let(:mlconf) { Hash.new } - let(:events) { Array.new } - let(:mlcodec) { LogStash::Codecs::Multiline.new(mlconf) } - let(:codec) { FileInput::CodecTracer.new } - let(:tmpfile_path) { Stud::Temporary.pathname } - let(:sincedb_path) { Stud::Temporary.pathname } - let(:tmpdir_path) { Stud::Temporary.directory } - - after :each do - FileUtils.rm_rf(sincedb_path) - end - - context "when data exists and then more data is appended" do - subject { described_class.new(conf) } - - before do - File.open(tmpfile_path, "w") do |fd| - fd.puts("ignore me 1") - fd.puts("ignore me 2") - fd.fsync - end - mlconf.update("pattern" => "^\s", "what" => "previous") - conf.update("type" => "blah", - "path" => tmpfile_path, - "sincedb_path" => sincedb_path, - "stat_interval" => 0.1, - "codec" => mlcodec, - "delimiter" => FILE_DELIMITER) - subject.register - end - - it "reads the appended data only" do - RSpec::Sequencing - .run_after(0.1, "assert zero events then append two lines") do - expect(events.size).to eq(0) - File.open(tmpfile_path, "a") { |fd| fd.puts("hello"); fd.puts("world") } - end - .then_after(0.25, "quit") do - subject.stop - end - - subject.run(events) - - event1 = events[0] - expect(event1).not_to be_nil - expect(event1.get("path")).to eq tmpfile_path - expect(event1.get("[@metadata][path]")).to eq tmpfile_path - expect(event1.get("message")).to eq "hello" - - event2 = events[1] - expect(event2).not_to be_nil - expect(event2.get("path")).to eq tmpfile_path - expect(event2.get("[@metadata][path]")).to eq tmpfile_path - expect(event2.get("message")).to eq "world" - end - end - - context "when close_older config is specified" do - let(:line) { "line1.1-of-a" } - - subject { described_class.new(conf) } - - before do - conf.update( - "type" => "blah", - "path" => "#{tmpdir_path}/*.log", - "sincedb_path" => sincedb_path, - "stat_interval" => 0.02, - "codec" => codec, - "close_older" => 0.5, - "delimiter" => FILE_DELIMITER) - - subject.register - end - - it "having timed_out, the identity is evicted" do - RSpec::Sequencing - .run("create file") do - File.open("#{tmpdir_path}/a.log", "wb") { |file| file.puts(line) } - end - .then_after(0.3, "identity is mapped") do - expect(codec.trace_for(:accept)).to eq([true]) - expect(subject.codec.identity_count).to eq(1) - end - .then_after(0.3, "test for auto_flush") do - expect(codec.trace_for(:auto_flush)).to eq([true]) - expect(subject.codec.identity_count).to eq(0) - end - .then_after(0.1, "quit") do - subject.stop - end - subject.run(events) - end - end - - context "when ignore_older config is specified" do - let(:line) { "line1.1-of-a" } - let(:tmp_dir_file) { "#{tmpdir_path}/a.log" } - - subject { described_class.new(conf) } - - before do - File.open(tmp_dir_file, "a") do |fd| - fd.puts(line) - fd.fsync - end - FileInput.make_file_older(tmp_dir_file, 2) - conf.update( - "type" => "blah", - "path" => "#{tmpdir_path}/*.log", - "sincedb_path" => sincedb_path, - "stat_interval" => 0.02, - "codec" 
=> codec, - "ignore_older" => 1, - "delimiter" => FILE_DELIMITER) - - subject.register - Thread.new { subject.run(events) } - end - - it "the file is not read" do - sleep 0.1 - subject.stop - expect(codec).to receive_call_and_args(:accept, false) - expect(codec).to receive_call_and_args(:auto_flush, false) - expect(subject.codec.identity_count).to eq(0) - end - end - - context "when wildcard path and a multiline codec is specified" do - subject { described_class.new(conf) } - - before do - mlconf.update("pattern" => "^\s", "what" => "previous") - conf.update( - "type" => "blah", - "path" => "#{tmpdir_path}/*.log", - "sincedb_path" => sincedb_path, - "stat_interval" => 0.05, - "codec" => mlcodec, - "delimiter" => FILE_DELIMITER) - - subject.register - end - - it "collects separate multiple line events from each file" do - actions = RSpec::Sequencing - .run_after(0.1, "create files") do - File.open("#{tmpdir_path}/A.log", "wb") do |fd| - fd.puts("line1.1-of-a") - fd.puts(" line1.2-of-a") - fd.puts(" line1.3-of-a") - end - File.open("#{tmpdir_path}/z.log", "wb") do |fd| - fd.puts("line1.1-of-z") - fd.puts(" line1.2-of-z") - fd.puts(" line1.3-of-z") - end - end - .then_after(0.2, "assert both files are mapped as identities and stop") do - expect(subject.codec.identity_count).to eq(2) - end - .then_after(0.1, "stop") do - subject.stop - end - .then_after(0.2 , "stop flushes both events") do - expect(events.size).to eq(2) - e1, e2 = events - e1_message = e1.get("message") - e2_message = e2.get("message") - - # can't assume File A will be read first - if e1_message.start_with?('line1.1-of-z') - expect(e1.get("path")).to match(/z.log/) - expect(e2.get("path")).to match(/A.log/) - expect(e1_message).to eq("line1.1-of-z#{FILE_DELIMITER} line1.2-of-z#{FILE_DELIMITER} line1.3-of-z") - expect(e2_message).to eq("line1.1-of-a#{FILE_DELIMITER} line1.2-of-a#{FILE_DELIMITER} line1.3-of-a") - else - expect(e1.get("path")).to match(/A.log/) - expect(e2.get("path")).to match(/z.log/) - expect(e1_message).to eq("line1.1-of-a#{FILE_DELIMITER} line1.2-of-a#{FILE_DELIMITER} line1.3-of-a") - expect(e2_message).to eq("line1.1-of-z#{FILE_DELIMITER} line1.2-of-z#{FILE_DELIMITER} line1.3-of-z") - end - end - subject.run(events) - # wait for actions to complete - actions.value - end - - context "if auto_flush is enabled on the multiline codec" do - let(:mlconf) { { "auto_flush_interval" => 0.5 } } - - it "an event is generated via auto_flush" do - actions = RSpec::Sequencing - .run_after(0.1, "create files") do - File.open("#{tmpdir_path}/A.log", "wb") do |fd| - fd.puts("line1.1-of-a") - fd.puts(" line1.2-of-a") - fd.puts(" line1.3-of-a") - end - end - .then_after(0.75, "wait for auto_flush") do - e1 = events.first - e1_message = e1.get("message") - expect(e1["path"]).to match(/a.log/) - expect(e1_message).to eq("line1.1-of-a#{FILE_DELIMITER} line1.2-of-a#{FILE_DELIMITER} line1.3-of-a") - end - .then("stop") do - subject.stop - end - subject.run(events) - # wait for actions to complete - actions.value - end - end - end - - context "when #run is called multiple times", :unix => true do - let(:file_path) { "#{tmpdir_path}/a.log" } - let(:buffer) { [] } - let(:lsof) { [] } - let(:run_thread_proc) do - lambda { Thread.new { subject.run(buffer) } } - end - let(:lsof_proc) do - lambda { `lsof -p #{Process.pid} | grep #{file_path}` } - end - - subject { described_class.new(conf) } - - before do - conf.update( - "path" => tmpdir_path + "/*.log", - "start_position" => "beginning", - "stat_interval" => 0.1, - "sincedb_path" => 
sincedb_path) - - File.open(file_path, "w") do |fd| - fd.puts('foo') - fd.puts('bar') - fd.fsync - end - end - - it "should only have one set of files open" do - subject.register - expect(lsof_proc.call).to eq("") - run_thread_proc.call - sleep 0.25 - first_lsof = lsof_proc.call - expect(first_lsof.scan(file_path).size).to eq(1) - run_thread_proc.call - sleep 0.25 - second_lsof = lsof_proc.call - expect(second_lsof.scan(file_path).size).to eq(1) - end - end - - describe "specifying max_open_files" do - subject { described_class.new(conf) } - before do - File.open("#{tmpdir_path}/a.log", "w") do |fd| - fd.puts("line1-of-a") - fd.puts("line2-of-a") - fd.fsync - end - File.open("#{tmpdir_path}/z.log", "w") do |fd| - fd.puts("line1-of-z") - fd.puts("line2-of-z") - fd.fsync - end - end - - context "when close_older is NOT specified" do - before do - conf.clear - conf.update( - "type" => "blah", - "path" => "#{tmpdir_path}/*.log", - "sincedb_path" => sincedb_path, - "stat_interval" => 0.1, - "max_open_files" => 1, - "start_position" => "beginning", - "delimiter" => FILE_DELIMITER) - subject.register - end - it "collects line events from only one file" do - actions = RSpec::Sequencing - .run_after(0.2, "assert one identity is mapped") do - expect(subject.codec.identity_count).to eq(1) - end - .then_after(0.1, "stop") do - subject.stop - end - .then_after(0.1, "stop flushes last event") do - expect(events.size).to eq(2) - e1, e2 = events - if Dir.glob("#{tmpdir_path}/*.log").first =~ %r{a\.log} - #linux and OSX have different retrieval order - expect(e1.get("message")).to eq("line1-of-a") - expect(e2.get("message")).to eq("line2-of-a") - else - expect(e1.get("message")).to eq("line1-of-z") - expect(e2.get("message")).to eq("line2-of-z") - end - end - subject.run(events) - # wait for actions future value - actions.value - end - end - - context "when close_older IS specified" do - before do - conf.update( - "type" => "blah", - "path" => "#{tmpdir_path}/*.log", - "sincedb_path" => sincedb_path, - "stat_interval" => 0.1, - "max_open_files" => 1, - "close_older" => 0.5, - "start_position" => "beginning", - "delimiter" => FILE_DELIMITER) - subject.register - end - - it "collects line events from both files" do - actions = RSpec::Sequencing - .run_after(0.2, "assert both identities are mapped and the first two events are built") do - expect(subject.codec.identity_count).to eq(2) - expect(events.size).to eq(2) - end - .then_after(0.8, "wait for close to flush last event of each identity") do - expect(events.size).to eq(4) - if Dir.glob("#{tmpdir_path}/*.log").first =~ %r{a\.log} - #linux and OSX have different retrieval order - e1, e2, e3, e4 = events - else - e3, e4, e1, e2 = events - end - expect(e1.get("message")).to eq("line1-of-a") - expect(e2.get("message")).to eq("line2-of-a") - expect(e3.get("message")).to eq("line1-of-z") - expect(e4.get("message")).to eq("line2-of-z") - end - .then_after(0.1, "stop") do - subject.stop - end - subject.run(events) - # wait for actions future value - actions.value - end - end - end - end -end diff --git a/spec/inputs/file_tail_spec.rb b/spec/inputs/file_tail_spec.rb new file mode 100644 index 00000000..8dae213f --- /dev/null +++ b/spec/inputs/file_tail_spec.rb @@ -0,0 +1,566 @@ +# encoding: utf-8 + +require "helpers/spec_helper" +require "logstash/devutils/rspec/shared_examples" +require "logstash/inputs/file" +require "logstash/plugin_mixins/ecs_compatibility_support/spec_helper" + +require "json" +require "tempfile" +require "stud/temporary" +require 
"logstash/codecs/multiline" + +# LogStash::Logging::Logger::configure_logging("DEBUG") + +TEST_FILE_DELIMITER = $/ + +describe LogStash::Inputs::File do + describe "'tail' mode testing with input(conf) do |pipeline, queue|" do + it_behaves_like "an interruptible input plugin" do + let(:config) do + { + "path" => Stud::Temporary.pathname, + "sincedb_path" => Stud::Temporary.pathname + } + end + end + + let(:directory) { Stud::Temporary.directory } + let(:sincedb_dir) { Stud::Temporary.directory } + let(:tmpfile_path) { ::File.join(directory, "#{name}.txt") } + let(:sincedb_path) { ::File.join(sincedb_dir, "readmode_#{name}_sincedb.txt") } + let(:path_path) { ::File.join(directory, "*.txt") } + + context "for an existing file" do + let(:name) { "A" } + it "should start at the beginning" do + conf = <<-CONFIG + input { + file { + type => "blah" + path => "#{path_path}" + start_position => "beginning" + sincedb_path => "#{sincedb_path}" + delimiter => "#{TEST_FILE_DELIMITER}" + } + } + CONFIG + + File.open(tmpfile_path, "a") do |fd| + fd.puts("hello") + fd.puts("world") + fd.fsync + end + + events = input(conf) do |pipeline, queue| + 2.times.collect { queue.pop } + end + expect(events.map{|e| e.get("message")}).to contain_exactly("hello", "world") + end + end + + context "running the input twice" do + let(:name) { "B" } + it "should restart at the sincedb value" do + conf = <<-CONFIG + input { + file { + type => "blah" + path => "#{path_path}" + start_position => "beginning" + sincedb_path => "#{sincedb_path}" + file_sort_by => "path" + delimiter => "#{TEST_FILE_DELIMITER}" + } + } + CONFIG + + File.open(tmpfile_path, "w") do |fd| + fd.puts("hello3") + fd.puts("world3") + end + + events = input(conf) do |pipeline, queue| + 2.times.collect { queue.pop } + end + + expect(events.map{|e| e.get("message")}).to contain_exactly("hello3", "world3") + + File.open(tmpfile_path, "a") do |fd| + fd.puts("foo") + fd.puts("bar") + fd.puts("baz") + fd.fsync + end + + events = input(conf) do |pipeline, queue| + 3.times.collect { queue.pop } + end + messages = events.map{|e| e.get("message")} + expect(messages).to contain_exactly("foo", "bar", "baz") + end + end + + + context "when path and host fields exist", :ecs_compatibility_support do + ecs_compatibility_matrix(:disabled, :v1, :v8 => :v1) do |ecs_select| + + before(:each) do + allow_any_instance_of(described_class).to receive(:ecs_compatibility).and_return(ecs_compatibility) + end + + let(:file_path_target_field ) { ecs_select[disabled: "path", v1: '[log][file][path]'] } + let(:source_host_target_field) { ecs_select[disabled: "host", v1: '[host][name]'] } + + let(:event_with_existing) do + LogStash::Event.new.tap do |e| + e.set(file_path_target_field, 'my_path') + e.set(source_host_target_field, 'my_host') + end.to_hash + end + + let(:name) { "C" } + it "should not overwrite them" do + conf = <<-CONFIG + input { + file { + type => "blah" + path => "#{path_path}" + start_position => "beginning" + sincedb_path => "#{sincedb_path}" + delimiter => "#{TEST_FILE_DELIMITER}" + codec => "json" + } + } + CONFIG + + File.open(tmpfile_path, "w") do |fd| + fd.puts(event_with_existing.to_json) + fd.puts('{"my_field": "my_val"}') + fd.fsync + end + + events = input(conf) do |pipeline, queue| + 2.times.collect { queue.pop } + end + + existing_path_index, added_path_index = "my_val" == events[0].get("my_field") ? 
[1,0] : [0,1] + + expect(events[existing_path_index].get(file_path_target_field)).to eq "my_path" + expect(events[existing_path_index].get(source_host_target_field)).to eq "my_host" + expect(events[existing_path_index].get("[@metadata][host]")).to eq "#{Socket.gethostname.force_encoding(Encoding::UTF_8)}" + + expect(events[added_path_index].get(file_path_target_field)).to eq "#{tmpfile_path}" + expect(events[added_path_index].get(source_host_target_field)).to eq "#{Socket.gethostname.force_encoding(Encoding::UTF_8)}" + expect(events[added_path_index].get("[@metadata][host]")).to eq "#{Socket.gethostname.force_encoding(Encoding::UTF_8)}" + end + end + end + + context "running the input twice", :ecs_compatibility_support do + ecs_compatibility_matrix(:disabled, :v1, :v8 => :v1) do |ecs_select| + + before(:each) do + allow_any_instance_of(described_class).to receive(:ecs_compatibility).and_return(ecs_compatibility) + end + + let(:file_path_target_field ) { ecs_select[disabled: "path", v1: '[log][file][path]'] } + let(:source_host_target_field) { ecs_select[disabled: "host", v1: '[host][name]'] } + + let(:name) { "D" } + it "should read old files" do + conf = <<-CONFIG + input { + file { + type => "blah" + path => "#{path_path}" + start_position => "beginning" + codec => "json" + } + } + CONFIG + + File.open(tmpfile_path, "w") do |fd| + fd.puts('{"path": "my_path", "host": "my_host"}') + fd.puts('{"my_field": "my_val"}') + fd.fsync + end + # arbitrary old file (2 days) + FileInput.make_file_older(tmpfile_path, 48 * 60 * 60) + + events = input(conf) do |pipeline, queue| + 2.times.collect { queue.pop } + end + existing_path_index, added_path_index = "my_val" == events[0].get("my_field") ? [1,0] : [0,1] + expect(events[existing_path_index].get("path")).to eq "my_path" + expect(events[existing_path_index].get("host")).to eq "my_host" + expect(events[existing_path_index].get("[@metadata][host]")).to eq "#{Socket.gethostname.force_encoding(Encoding::UTF_8)}" + + expect(events[added_path_index].get(file_path_target_field)).to eq "#{tmpfile_path}" + expect(events[added_path_index].get(source_host_target_field)).to eq "#{Socket.gethostname.force_encoding(Encoding::UTF_8)}" + expect(events[added_path_index].get("[@metadata][host]")).to eq "#{Socket.gethostname.force_encoding(Encoding::UTF_8)}" + end + end + end + + context "when sincedb_path is a directory" do + let(:name) { "E" } + subject { LogStash::Inputs::File.new("path" => path_path, "sincedb_path" => directory) } + + after :each do + FileUtils.rm_rf(sincedb_path) + end + + it "should raise exception" do + expect { subject.register }.to raise_error(ArgumentError) + end + end + + context "when mode it set to tail and exit_after_read equals true" do + subject { LogStash::Inputs::File.new("path" => path_path, "exit_after_read" => true, "mode" => "tail") } + + it "should raise exception" do + expect { subject.register }.to raise_error(ArgumentError) + end + end + + end + + describe "testing with new, register, run and stop" do + let(:suffix) { "A" } + let(:conf) { Hash.new } + let(:mlconf) { Hash.new } + let(:events) { Array.new } + let(:mlcodec) { LogStash::Codecs::Multiline.new(mlconf) } + let(:tracer_codec) { FileInput::CodecTracer.new } + let(:tmpdir_path) { Stud::Temporary.directory } + let(:tmpfile_path) { ::File.join(tmpdir_path, "#{suffix}.txt") } + let(:path_path) { ::File.join(tmpdir_path, "*.txt") } + let(:sincedb_path) { ::File.join(tmpdir_path, "sincedb-#{suffix}") } + + after :each do + sleep(0.1) until subject.completely_stopped? 
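+      # the loop above waits until the input reports it has completely stopped,
+      # presumably so that a late sincedb flush cannot re-create state after the
+      # cleanup below removes it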
+ FileUtils.rm_rf(sincedb_path) + end + + context "when data exists and then more data is appended", :ecs_compatibility_support do + ecs_compatibility_matrix(:disabled, :v1, :v8 => :v1) do |ecs_select| + + before(:each) do + allow_any_instance_of(described_class).to receive(:ecs_compatibility).and_return(ecs_compatibility) + end + + let(:file_path_target_field ) { ecs_select[disabled: "path", v1: '[log][file][path]'] } + subject { described_class.new(conf) } + + before do + File.open(tmpfile_path, "w") do |fd| + fd.puts("ignore me 1") + fd.puts("ignore me 2") + fd.fsync + end + mlconf.update("pattern" => "^\s", "what" => "previous") + conf.update("type" => "blah", + "path" => path_path, + "sincedb_path" => sincedb_path, + "stat_interval" => 0.1, + "codec" => mlcodec, + "delimiter" => TEST_FILE_DELIMITER) + end + + it "reads the appended data only" do + subject.register + actions = RSpec::Sequencing + .run_after(1, "append two lines after delay") do + File.open(tmpfile_path, "a") { |fd| fd.puts("hello"); fd.puts("world") } + end + .then("wait for one event") do + wait(0.75).for{events.size}.to eq(1) + end + .then("quit") do + subject.stop + end + .then("wait for flushed event") do + wait(0.75).for{events.size}.to eq(2) + end + + subject.run(events) + actions.assert_no_errors + + event1 = events[0] + expect(event1).not_to be_nil + expect(event1.get(file_path_target_field)).to eq tmpfile_path + expect(event1.get("[@metadata][path]")).to eq tmpfile_path + expect(event1.get("message")).to eq "hello" + + event2 = events[1] + expect(event2).not_to be_nil + expect(event2.get(file_path_target_field)).to eq tmpfile_path + expect(event2.get("[@metadata][path]")).to eq tmpfile_path + expect(event2.get("message")).to eq "world" + end + end + end + + context "when close_older config is specified" do + let(:line) { "line1.1-of-a" } + let(:suffix) { "X" } + subject { described_class.new(conf) } + + before do + conf.update( + "type" => "blah", + "path" => path_path, + "sincedb_path" => sincedb_path, + "stat_interval" => 0.02, + "codec" => tracer_codec, + "close_older" => "100 ms", + "start_position" => "beginning", + "delimiter" => TEST_FILE_DELIMITER) + + subject.register + end + + it "having timed_out, the codec is auto flushed" do + actions = RSpec::Sequencing + .run("create file") do + File.open(tmpfile_path, "wb") { |file| file.puts(line) } + end + .then_after(0.1, "identity is mapped") do + wait(0.75).for{subject.codec.identity_map[tmpfile_path]}.not_to be_nil, "identity is not mapped" + end + .then("wait accept") do + wait(0.75).for { + subject.codec.identity_map[tmpfile_path].codec.trace_for(:accept) + }.to eq(true), "accept didn't" + end + .then("request a stop") do + # without this the subject.run doesn't invokes the #exit_flush which is the only @codec.flush_mapped invocation + subject.stop + end + .then("wait for auto_flush") do + wait(2).for { + subject.codec.identity_map[tmpfile_path].codec.trace_for(:auto_flush) + }.to eq(true), "autoflush didn't" + end + subject.run(events) + actions.assert_no_errors + expect(subject.codec.identity_map[tmpfile_path].codec.trace_for(:accept)).to eq(true) + end + end + + context "when ignore_older config is specified" do + let(:suffix) { "Y" } + before do + conf.update( + "type" => "blah", + "path" => path_path, + "sincedb_path" => sincedb_path, + "stat_interval" => 0.02, + "codec" => tracer_codec, + "ignore_older" => "500 ms", + "delimiter" => TEST_FILE_DELIMITER) + end + subject { described_class.new(conf) } + let(:line) { "line1.1-of-a" } + + it "the file 
is not read" do + subject.register + RSpec::Sequencing + .run("create file") do + File.open(tmp_dir_file, "a") do |fd| + fd.puts(line) + fd.fsync + end + FileInput.make_file_older(tmp_dir_file, 2) + end + .then_after(0.5, "stop") do + subject.stop + end + subject.run(events) + expect(subject.codec.identity_map[tmpfile_path].codec.trace_for(:accept)).to be_falsey + end + end + + context "when wildcard path and a multiline codec is specified", :ecs_compatibility_support do + ecs_compatibility_matrix(:disabled, :v1, :v8 => :v1) do |ecs_select| + + before(:each) do + allow_any_instance_of(described_class).to receive(:ecs_compatibility).and_return(ecs_compatibility) + end + + let(:file_path_target_field ) { ecs_select[disabled: "path", v1: '[log][file][path]'] } + + subject { described_class.new(conf) } + let(:suffix) { "J" } + let(:tmpfile_path2) { ::File.join(tmpdir_path, "K.txt") } + before do + mlconf.update("pattern" => "^\s", "what" => "previous") + conf.update( + "type" => "blah", + "path" => path_path, + "start_position" => "beginning", + "sincedb_path" => sincedb_path, + "stat_interval" => 0.05, + "codec" => mlcodec, + "file_sort_by" => "path", + "delimiter" => TEST_FILE_DELIMITER) + + subject.register + end + + it "collects separate multiple line events from each file" do + subject + actions = RSpec::Sequencing + .run_after(0.1, "create files") do + File.open(tmpfile_path, "wb") do |fd| + fd.puts("line1.1-of-J") + fd.puts(" line1.2-of-J") + fd.puts(" line1.3-of-J") + end + File.open(tmpfile_path2, "wb") do |fd| + fd.puts("line1.1-of-K") + fd.puts(" line1.2-of-K") + fd.puts(" line1.3-of-K") + end + end + .then("assert both files are mapped as identities and stop") do + wait(2).for {subject.codec.identity_count}.to eq(2), "both files are not mapped as identities" + end + .then("stop") do + subject.stop + end + subject.run(events) + # wait for actions to complete + actions.assert_no_errors + expect(events.size).to eq(2) + e1, e2 = events + e1_message = e1.get("message") + e2_message = e2.get("message") + + expect(e1.get(file_path_target_field)).to match(/J.txt/) + expect(e2.get(file_path_target_field)).to match(/K.txt/) + expect(e1_message).to eq("line1.1-of-J#{TEST_FILE_DELIMITER} line1.2-of-J#{TEST_FILE_DELIMITER} line1.3-of-J") + expect(e2_message).to eq("line1.1-of-K#{TEST_FILE_DELIMITER} line1.2-of-K#{TEST_FILE_DELIMITER} line1.3-of-K") + end + + context "if auto_flush is enabled on the multiline codec" do + let(:mlconf) { { "auto_flush_interval" => 0.5 } } + let(:suffix) { "M" } + it "an event is generated via auto_flush" do + actions = RSpec::Sequencing + .run_after(0.1, "create files") do + File.open(tmpfile_path, "wb") do |fd| + fd.puts("line1.1-of-a") + fd.puts(" line1.2-of-a") + fd.puts(" line1.3-of-a") + end + end + .then("wait for auto_flush") do + wait(2).for{events.size}.to eq(1), "events size is not 1" + end + .then("stop") do + subject.stop + end + subject.run(events) + # wait for actions to complete + actions.assert_no_errors + e1 = events.first + e1_message = e1.get("message") + expect(e1_message).to eq("line1.1-of-a#{TEST_FILE_DELIMITER} line1.2-of-a#{TEST_FILE_DELIMITER} line1.3-of-a") + expect(e1.get(file_path_target_field)).to match(/M.txt$/) + end + end + end + end + + describe "specifying max_open_files" do + let(:suffix) { "P" } + let(:tmpfile_path2) { ::File.join(tmpdir_path, "Q.txt") } + subject { described_class.new(conf) } + before do + File.open(tmpfile_path, "w") do |fd| + fd.puts("line1-of-P") + fd.puts("line2-of-P") + fd.fsync + end + 
File.open(tmpfile_path2, "w") do |fd| + fd.puts("line1-of-Q") + fd.puts("line2-of-Q") + fd.fsync + end + end + + context "when close_older is NOT specified" do + before do + conf.clear + conf.update( + "type" => "blah", + "path" => path_path, + "sincedb_path" => sincedb_path, + "stat_interval" => 0.1, + "max_open_files" => 1, + "start_position" => "beginning", + "file_sort_by" => "path", + "delimiter" => TEST_FILE_DELIMITER) + subject.register + end + it "collects line events from only one file" do + actions = RSpec::Sequencing + .run("assert one identity is mapped") do + wait(0.4).for{subject.codec.identity_count}.to be > 0, "no identity is mapped" + end + .then("stop") do + subject.stop + end + .then("stop flushes last event") do + wait(0.4).for{events.size}.to eq(2), "events size does not equal 2" + end + subject.run(events) + # wait for actions future value + actions.assert_no_errors + e1, e2 = events + expect(e1.get("message")).to eq("line1-of-P") + expect(e2.get("message")).to eq("line2-of-P") + end + end + + context "when close_older IS specified" do + before do + conf.update( + "type" => "blah", + "path" => path_path, + "sincedb_path" => sincedb_path, + "stat_interval" => 0.1, + "max_open_files" => 1, + "close_older" => 0.5, + "start_position" => "beginning", + "file_sort_by" => "path", + "delimiter" => TEST_FILE_DELIMITER) + subject.register + end + + it "collects line events from both files" do + actions = RSpec::Sequencing + .run("assert both identities are mapped and the first two events are built") do + wait(0.4).for{subject.codec.identity_count == 1 && events.size == 2}.to eq(true), "both identities are not mapped and the first two events are not built" + end + .then("wait for close to flush last event of each identity") do + wait(0.8).for{events.size}.to eq(4), "close does not flush last event of each identity" + end + .then_after(0.1, "stop") do + subject.stop + end + subject.run(events) + # wait for actions future value + actions.assert_no_errors + e1, e2, e3, e4 = events + expect(e1.get("message")).to eq("line1-of-P") + expect(e2.get("message")).to eq("line2-of-P") + expect(e3.get("message")).to eq("line1-of-Q") + expect(e4.get("message")).to eq("line2-of-Q") + end + end + end + end +end diff --git a/spec/inputs/friendly_durations_spec.rb b/spec/inputs/friendly_durations_spec.rb new file mode 100644 index 00000000..586bd865 --- /dev/null +++ b/spec/inputs/friendly_durations_spec.rb @@ -0,0 +1,71 @@ +# encoding: utf-8 + +require "helpers/spec_helper" +require "logstash/inputs/friendly_durations" + +describe "FriendlyDurations module function call" do + context "unacceptable strings" do + it "gives an error message for 'foobar'" do + result = LogStash::Inputs::FriendlyDurations.call("foobar","sec") + expect(result.error_message).to start_with("Value 'foobar' is not a valid duration string e.g. 200 usec") + end + it "gives an error message for '5 5 days'" do + result = LogStash::Inputs::FriendlyDurations.call("5 5 days","sec") + expect(result.error_message).to start_with("Value '5 5 days' is not a valid duration string e.g. 
200 usec") + end + end + + context "when a unit is not specified, a unit override will affect the result" do + it "coerces 14 to 1209600.0s as days" do + result = LogStash::Inputs::FriendlyDurations.call(14,"d") + expect(result.error_message).to eq(nil) + expect(result.value).to eq(1209600.0) + end + it "coerces '30' to 1800.0s as minutes" do + result = LogStash::Inputs::FriendlyDurations.call("30","minutes") + expect(result.to_a).to eq([true, 1800.0]) + end + end + + context "acceptable strings" do + [ + ["10", 10.0], + ["10.5 s", 10.5], + ["10.75 secs", 10.75], + ["11 second", 11.0], + ["10 seconds", 10.0], + ["500 ms", 0.5], + ["750.9 msec", 0.7509], + ["750.9 msecs", 0.7509], + ["750.9 us", 0.0007509], + ["750.9 usec", 0.0007509], + ["750.9 usecs", 0.0007509], + ["1.5m", 90.0], + ["2.5 m", 150.0], + ["1.25 min", 75.0], + ["1 minute", 60.0], + ["2.5 minutes", 150.0], + ["2h", 7200.0], + ["2 h", 7200.0], + ["1 hour", 3600.0], + ["1hour", 3600.0], + ["3 hours", 10800.0], + ["0.5d", 43200.0], + ["1day", 86400.0], + ["1 day", 86400.0], + ["2days", 172800.0], + ["14 days", 1209600.0], + ["1w", 604800.0], + ["1 w", 604800.0], + ["1 week", 604800.0], + ["2weeks", 1209600.0], + ["2 weeks", 1209600.0], + ["1.5 weeks", 907200.0], + ].each do |input, coerced| + it "coerces #{input.inspect.rjust(16)} to #{coerced.inspect}" do + result = LogStash::Inputs::FriendlyDurations.call(input,"sec") + expect(result.to_a).to eq([true, coerced]) + end + end + end +end diff --git a/src/main/java/JrubyFileWatchService.java b/src/main/java/JrubyFileWatchService.java new file mode 100644 index 00000000..969147c1 --- /dev/null +++ b/src/main/java/JrubyFileWatchService.java @@ -0,0 +1,11 @@ +import org.jruby.Ruby; +import org.jruby.runtime.load.BasicLibraryService; +import org.logstash.filewatch.JrubyFileWatchLibrary; + +public class JrubyFileWatchService implements BasicLibraryService { + @Override + public final boolean basicLoad(final Ruby runtime) { + new JrubyFileWatchLibrary().load(runtime, false); + return true; + } +} diff --git a/src/main/java/jnr/posix/windows/WindowsFileInformationByHandle.java b/src/main/java/jnr/posix/windows/WindowsFileInformationByHandle.java new file mode 100644 index 00000000..7e36502d --- /dev/null +++ b/src/main/java/jnr/posix/windows/WindowsFileInformationByHandle.java @@ -0,0 +1,24 @@ +package jnr.posix.windows; +/* +This, sadly can't be used. +See JrubyFileWatchLibrary class +The jnr jar is loaded by a different class loader than our jar (in rspec anyway) +Even though the package is the same, Java restricts access to `dwVolumeSerialNumber` in the super class +We have to continue to use FFI in Ruby. +*/ + +public class WindowsFileInformationByHandle extends WindowsByHandleFileInformation { + public WindowsFileInformationByHandle(jnr.ffi.Runtime runtime) { + super(runtime); + } + + public java.lang.String getIdentifier() { + StringBuilder builder = new StringBuilder(); + builder.append(dwVolumeSerialNumber.intValue()); + builder.append("-"); + builder.append(nFileIndexHigh.intValue()); + builder.append("-"); + builder.append(nFileIndexLow.intValue()); + return builder.toString(); + } +} diff --git a/src/main/java/org/logstash/filewatch/JrubyFileWatchLibrary.java b/src/main/java/org/logstash/filewatch/JrubyFileWatchLibrary.java new file mode 100644 index 00000000..ab344cd9 --- /dev/null +++ b/src/main/java/org/logstash/filewatch/JrubyFileWatchLibrary.java @@ -0,0 +1,304 @@ +package org.logstash.filewatch; + +/** + * Created with IntelliJ IDEA. 
User: efrey Date: 6/11/13 Time: 11:00 AM To + * change this template use File | Settings | File Templates. + * + * http://bugs.sun.com/view_bug.do?bug_id=6357433 + * [Guy] modified original to be a proper JRuby class + * [Guy] do we need this anymore? JRuby 1.7+ uses new Java 7 File API + * + * + * fnv code extracted and modified from https://github.com/jakedouglas/fnv-java + */ + +import jnr.ffi.Runtime; +import jnr.posix.HANDLE; +import jnr.posix.JavaLibCHelper; +import jnr.posix.POSIX; +import jnr.posix.WindowsLibC; +import jnr.posix.WindowsPOSIX; +import jnr.posix.util.WindowsHelpers; +import jnr.posix.windows.WindowsFileInformationByHandle; +import org.jruby.Ruby; +import org.jruby.RubyBignum; +import org.jruby.RubyClass; +import org.jruby.RubyFixnum; +import org.jruby.RubyIO; +import org.jruby.RubyModule; +import org.jruby.RubyNumeric; +import org.jruby.RubyObject; +import org.jruby.RubyString; +import org.jruby.anno.JRubyClass; +import org.jruby.anno.JRubyMethod; +import org.jruby.ext.ffi.Factory; +import org.jruby.ext.ffi.MemoryIO; +import org.jruby.ext.ffi.Pointer; +import org.jruby.runtime.Arity; +import org.jruby.runtime.Block; +import org.jruby.runtime.ThreadContext; +import org.jruby.runtime.builtin.IRubyObject; +import org.jruby.runtime.load.Library; +import org.jruby.util.io.OpenFile; + +import java.io.IOException; +import java.math.BigInteger; +import java.nio.channels.Channel; +import java.nio.channels.FileChannel; +import java.nio.file.FileSystems; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +@SuppressWarnings("ClassUnconnectedToPackage") +public class JrubyFileWatchLibrary implements Library { + + private static final BigInteger INIT32 = new BigInteger("811c9dc5", 16); + private static final BigInteger INIT64 = new BigInteger("cbf29ce484222325", 16); + private static final BigInteger PRIME32 = new BigInteger("01000193", 16); + private static final BigInteger PRIME64 = new BigInteger("100000001b3", 16); + private static final BigInteger MOD32 = new BigInteger("2").pow(32); + private static final BigInteger MOD64 = new BigInteger("2").pow(64); + + // private static final int GENERIC_ALL = 268435456; + private static final int GENERIC_READ = -2147483648; + // private static final int GENERIC_WRITE = 1073741824; + // private static final int GENERIC_EXECUTE = 33554432; + // private static final int FILE_SHARE_DELETE = 4; + private static final int FILE_SHARE_READ = 1; + private static final int FILE_SHARE_WRITE = 2; + // private static final int CREATE_ALWAYS = 2; + // private static final int CREATE_NEW = 1; + // private static final int OPEN_ALWAYS = 4; + private static final int OPEN_EXISTING = 3; + // private static final int TRUNCATE_EXISTING = 5; + private static final int FILE_FLAG_BACKUP_SEMANTICS = 33554432; + // private static final int FILE_ATTRIBUTE_READONLY = 1; + + @Override + public final void load(final Ruby runtime, final boolean wrap) { + final RubyModule module = runtime.defineModule("FileWatch"); + + RubyClass clazz = runtime.defineClassUnder("FileExt", runtime.getObject(), JrubyFileWatchLibrary.RubyFileExt::new, module); + clazz.defineAnnotatedMethods(JrubyFileWatchLibrary.RubyFileExt.class); + + clazz = runtime.defineClassUnder("Fnv", runtime.getObject(), JrubyFileWatchLibrary.Fnv::new, module); + clazz.defineAnnotatedMethods(JrubyFileWatchLibrary.Fnv.class); + + WatchedFilesCollection.load(runtime); + } + + @JRubyClass(name = "FileExt") + public static class RubyFileExt extends RubyObject { + + public RubyFileExt(final Ruby 
runtime, final RubyClass meta) { + super(runtime, meta); + } + + public RubyFileExt(final RubyClass meta) { + super(meta); + } + + @JRubyMethod(name = "open", required = 1, meta = true) + public static IRubyObject open(final ThreadContext context, final IRubyObject self, final RubyString path) throws IOException { + final Path javapath = FileSystems.getDefault().getPath(path.asJavaString()); + final Channel channel = FileChannel.open(javapath, StandardOpenOption.READ); + final RubyIO irubyobject = new RubyWinIO(context.runtime, channel); + return irubyobject; + } + + @JRubyMethod(name = "io_handle", required = 1, meta = true) + public static IRubyObject ioHandle(final ThreadContext context, final IRubyObject self, final IRubyObject object, Block block) { + final Ruby runtime = context.runtime; + if (!block.isGiven()) { + throw runtime.newArgumentError(0, 1); + } + if (object instanceof RubyWinIO) { + final RubyWinIO rubyWinIO = (RubyWinIO) object; + final OpenFile fptr = rubyWinIO.getOpenFileChecked(); + final boolean locked = fptr.lock(); + try { + fptr.checkClosed(); + if (rubyWinIO.isDirect()) { + final MemoryIO memoryio = Factory.getInstance().wrapDirectMemory(runtime, rubyWinIO.getAddress()); + final Pointer pointer = new Pointer(runtime, memoryio); + return block.yield(context, pointer); + } + } finally { + if (locked) { + fptr.unlock(); + } + } + } else { + System.out.println("Required argument is not a WinIO instance"); + } + return runtime.newString(); + } + + //@JRubyMethod(name = "io_inode", required = 1, meta = true) + public static RubyString ioInode(final ThreadContext context, final IRubyObject self, final IRubyObject object) { + final Ruby runtime = context.runtime; + if (!(object instanceof RubyIO)) { + System.out.println("Required argument is not an IO instance"); + return runtime.newString(); + } + final RubyIO rubyIO = (RubyIO) object; + final OpenFile fptr = rubyIO.getOpenFileChecked(); + final boolean locked = fptr.lock(); + String inode = ""; + try { + fptr.checkClosed(); + final POSIX posix = runtime.getPosix(); + final int realFileno = fptr.fd().realFileno; + if (posix.isNative() && posix instanceof WindowsPOSIX && realFileno != -1) { + final WindowsPOSIX winposix = (WindowsPOSIX) posix; + final WindowsLibC wlibc = (WindowsLibC) winposix.libc(); + final WindowsFileInformationByHandle info = new WindowsFileInformationByHandle(Runtime.getRuntime(runtime.getPosix().libc())); + final HANDLE handle = JavaLibCHelper.gethandle(JavaLibCHelper.getDescriptorFromChannel(fptr.fd().chFile)); + if (handle.isValid()) { + if (wlibc.GetFileInformationByHandle(handle, info) > 0) { + inode = info.getIdentifier(); + } else { + System.out.println("Could not 'GetFileInformationByHandle' from handle"); + } + } else { + System.out.println("Could not derive 'HANDLE' from Ruby IO instance via io.getOpenFileChecked().fd().chFile"); + } + } + } finally { + if (locked) { + fptr.unlock(); + } + } + return runtime.newString(inode); + } + + //@JRubyMethod(name = "path_inode", required = 1, meta = true) + public static RubyString pathInode(final ThreadContext context, final IRubyObject self, final RubyString path) { + final Ruby runtime = context.runtime; + final POSIX posix = runtime.getPosix(); + String inode = ""; + if (posix.isNative() && posix.libc() instanceof WindowsLibC) { + final WindowsLibC wlibc = (WindowsLibC) posix.libc(); + final byte[] wpath = WindowsHelpers.toWPath(path.toString()); + final HANDLE handle = wlibc.CreateFileW(wpath, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE, 
null, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, 0); + if (handle.isValid()) { + final WindowsFileInformationByHandle info = new WindowsFileInformationByHandle(Runtime.getRuntime(runtime.getPosix().libc())); + if (wlibc.GetFileInformationByHandle(handle, info) > 0) { + inode = info.getIdentifier(); + } else { + System.out.println("Could not 'GetFileInformationByHandle' from handle"); + } + wlibc.CloseHandle(handle); + } else { + System.out.printf("Could not open file via 'CreateFileW' on path: %s", path.toString()); + } + } + return runtime.newString(inode); + } + } + + // This class may be used by fingerprinting in the future + @SuppressWarnings({"NewMethodNamingConvention", "ChainOfInstanceofChecks"}) + @JRubyClass(name = "Fnv") + public static class Fnv extends RubyObject { + + private byte[] bytes; + private long size; + private boolean open; + + public Fnv(final Ruby runtime, final RubyClass metaClass) { + super(runtime, metaClass); + } + + public Fnv(final RubyClass metaClass) { + super(metaClass); + } + + @JRubyMethod(name = "coerce_bignum", meta = true, required = 1) + public static IRubyObject coerceBignum(final ThreadContext ctx, final IRubyObject recv, final IRubyObject rubyObject) { + if (rubyObject instanceof RubyBignum) { + return rubyObject; + } + if (rubyObject instanceof RubyFixnum) { + return RubyBignum.newBignum(ctx.runtime, ((RubyNumeric) rubyObject).getBigIntegerValue()); + } + throw ctx.runtime.newRaiseException(ctx.runtime.getClass("StandardError"), "Can't coerce"); + } + + // def initialize(data) + @JRubyMethod(name = "initialize", required = 1) + public IRubyObject rubyInitialize(final ThreadContext ctx, final RubyString data) { + bytes = data.getBytes(); + size = (long) bytes.length; + open = true; + return ctx.nil; + } + + @JRubyMethod(name = "close") + public IRubyObject close(final ThreadContext ctx) { + open = false; + bytes = null; + return ctx.nil; + } + + @JRubyMethod(name = "open?") + public IRubyObject open_p(final ThreadContext ctx) { + if(open) { + return ctx.runtime.getTrue(); + } + return ctx.runtime.getFalse(); + } + + @JRubyMethod(name = "closed?") + public IRubyObject closed_p(final ThreadContext ctx) { + if(open) { + return ctx.runtime.getFalse(); + } + return ctx.runtime.getTrue(); + } + + @JRubyMethod(name = "fnv1a32", optional = 1) + public IRubyObject fnv1a_32(final ThreadContext ctx, final IRubyObject[] args) { + IRubyObject[] args1 = args; + if(open) { + args1 = Arity.scanArgs(ctx.runtime, args1, 0, 1); + return RubyBignum.newBignum(ctx.runtime, common_fnv(args1[0], INIT32, PRIME32, MOD32)); + } + throw ctx.runtime.newRaiseException(ctx.runtime.getClass("StandardError"), "Fnv instance is closed!"); + } + + @JRubyMethod(name = "fnv1a64", optional = 1) + public IRubyObject fnv1a_64(final ThreadContext ctx, final IRubyObject[] args) { + IRubyObject[] args1 = args; + if(open) { + args1 = Arity.scanArgs(ctx.runtime, args1, 0, 1); + return RubyBignum.newBignum(ctx.runtime, common_fnv(args1[0], INIT64, PRIME64, MOD64)); + } + throw ctx.runtime.newRaiseException(ctx.runtime.getClass("StandardError"), "Fnv instance is closed!"); + } + + private long convertLong(final IRubyObject obj) { + if(obj instanceof RubyNumeric) { + return ((RubyNumeric) obj).getLongValue(); + } + return size; + } + + private BigInteger common_fnv(final IRubyObject len, final BigInteger hash, final BigInteger prime, final BigInteger mod) { + long converted = convertLong(len); + + if (converted > size) { + converted = size; + } + + BigInteger tempHash = hash; + for (int idx = 0; 
(long) idx < converted; idx++) { + tempHash = tempHash.xor(BigInteger.valueOf((long) ((int) bytes[idx] & 0xff))); + tempHash = tempHash.multiply(prime).mod(mod); + } + + return tempHash; + } + } + +} diff --git a/src/main/java/org/logstash/filewatch/RubyWinIO.java b/src/main/java/org/logstash/filewatch/RubyWinIO.java new file mode 100644 index 00000000..c6cfbe4a --- /dev/null +++ b/src/main/java/org/logstash/filewatch/RubyWinIO.java @@ -0,0 +1,65 @@ +package org.logstash.filewatch; + +import jnr.posix.HANDLE; +import jnr.posix.JavaLibCHelper; +import org.jruby.Ruby; +import org.jruby.RubyBoolean; +import org.jruby.RubyIO; +import org.jruby.anno.JRubyClass; +import org.jruby.anno.JRubyMethod; +import org.jruby.runtime.ThreadContext; +import org.jruby.runtime.builtin.IRubyObject; +import org.jruby.util.io.OpenFile; + +import java.nio.channels.Channel; + +@JRubyClass(name = "WinIO") +public class RubyWinIO extends RubyIO { + private boolean valid; + private boolean direct; + private long address; + + public RubyWinIO(Ruby runtime, Channel channel) { + super(runtime, channel); + final OpenFile fptr = getOpenFileChecked(); + final boolean locked = fptr.lock(); + try { + fptr.checkClosed(); + final HANDLE handle = JavaLibCHelper.gethandle(JavaLibCHelper.getDescriptorFromChannel(fptr.fd().chFile)); + if (handle.isValid()) { + direct = handle.toPointer().isDirect(); + address = handle.toPointer().address(); + valid = true; + } else { + direct = false; + address = 0L; + valid = false; + } + } finally { + if (locked) { + fptr.unlock(); + } + } + } + + @JRubyMethod(name = "valid?") + public RubyBoolean valid_p(ThreadContext context) { + return context.runtime.newBoolean(valid); + } + + @Override + @JRubyMethod + public IRubyObject close() { + direct = false; + address = 0L; + return super.close(); + } + + final public boolean isDirect() { + return direct; + } + + final public long getAddress() { + return address; + } +} diff --git a/src/main/java/org/logstash/filewatch/WatchedFilesCollection.java b/src/main/java/org/logstash/filewatch/WatchedFilesCollection.java new file mode 100644 index 00000000..51fa8a97 --- /dev/null +++ b/src/main/java/org/logstash/filewatch/WatchedFilesCollection.java @@ -0,0 +1,235 @@ +package org.logstash.filewatch; + +import org.jruby.Ruby; +import org.jruby.RubyArray; +import org.jruby.RubyBoolean; +import org.jruby.RubyClass; +import org.jruby.RubyFloat; +import org.jruby.RubyHash; +import org.jruby.RubyObject; +import org.jruby.RubyString; +import org.jruby.anno.JRubyMethod; +import org.jruby.runtime.Block; +import org.jruby.runtime.ThreadContext; +import org.jruby.runtime.Visibility; +import org.jruby.runtime.builtin.IRubyObject; +import org.jruby.runtime.callsite.CachingCallSite; +import org.jruby.runtime.callsite.FunctionalCachingCallSite; + +import java.util.Comparator; +import java.util.SortedMap; +import java.util.TreeMap; + +/** + * FileWatch::WatchedFilesCollection for managing paths mapped to (watched) files. + * + * Implemented in native to avoid Ruby->Java type casting (which JRuby provides no control of as of 9.2). + * The collection already has a noticeable footprint when 10_000s of files are being watched at once, having + * the implementation in Java reduces 1000s of String conversions on every watch re-stat tick. 
+ */ +public class WatchedFilesCollection extends RubyObject { + + // we could have used Ruby's SortedSet but it does not provide support for custom comparators + private SortedMap files; // FileWatch::WatchedFile -> String + private RubyHash filesInverse; // String -> FileWatch::WatchedFile + private String sortBy; + + public WatchedFilesCollection(Ruby runtime, RubyClass metaClass) { + super(runtime, metaClass); + } + + static void load(Ruby runtime) { + runtime.getOrCreateModule("FileWatch") + .defineClassUnder("WatchedFilesCollection", runtime.getObject(), WatchedFilesCollection::new) + .defineAnnotatedMethods(WatchedFilesCollection.class); + } + + @JRubyMethod + public IRubyObject initialize(final ThreadContext context, IRubyObject settings) { + final String sort_by = settings.callMethod(context, "file_sort_by").asJavaString(); + final String sort_direction = settings.callMethod(context, "file_sort_direction").asJavaString(); + + Comparator comparator; + switch (sort_by) { + case "last_modified" : + sortBy = "modified_at"; + comparator = (file1, file2) -> { + if (file1 == file2) return 0; // fast shortcut + RubyFloat mtime1 = modified_at(context, file1); + RubyFloat mtime2 = modified_at(context, file2); + int cmp = Double.compare(mtime1.getDoubleValue(), mtime2.getDoubleValue()); + // if mtime same (rare unless file1 == file2) - order consistently + if (cmp == 0) return path(context, file1).op_cmp(path(context, file2)); + return cmp; + }; + break; + case "path" : + sortBy = "path"; + comparator = (file1, file2) -> path(context, file1).op_cmp(path(context, file2)); + break; + default : + throw context.runtime.newArgumentError("sort_by: '" + sort_by + "' not supported"); + } + switch (sort_direction) { + case "asc" : + // all good - comparator uses ascending order + break; + case "desc" : + comparator = comparator.reversed(); + break; + default : + throw context.runtime.newArgumentError("sort_direction: '" + sort_direction + "' not supported"); + } + + this.files = new TreeMap<>(comparator); + this.filesInverse = RubyHash.newHash(context.runtime); + + // variableTableStore("@files", JavaUtil.convertJavaToRuby(context.runtime, this.files)); + // variableTableStore("@files_inverse", this.filesInverse); + + return this; + } + + @JRubyMethod + public IRubyObject add(ThreadContext context, IRubyObject file) { + RubyString path = getFilePath(context, file); + synchronized (this) { + RubyString prev_path = this.files.put(file, path); + assert prev_path == null || path.equals(prev_path); // file's path should not change! 
+ this.filesInverse.op_aset(context, path, file); + } + return path; + } + + private static RubyString getFilePath(ThreadContext context, IRubyObject file) { + IRubyObject path = file.callMethod(context, "path"); + if (!(path instanceof RubyString)) { + throw context.runtime.newTypeError("expected file.path to return String but did not file: " + file.inspect()); + } + if (!path.isFrozen()) path = ((RubyString) path).dupFrozen(); // path = path.dup.freeze + return (RubyString) path; + } + + @JRubyMethod + public IRubyObject remove_paths(ThreadContext context, IRubyObject arg) { + IRubyObject[] paths; + if (arg instanceof RubyArray) { + paths = ((RubyArray) arg).toJavaArray(); + } else { + paths = new IRubyObject[] { arg }; + } + + int removedCount = 0; + synchronized (this) { + for (final IRubyObject path : paths) { + if (removePath(context, path.convertToString())) removedCount++; + } + } + return context.runtime.newFixnum(removedCount); + } + + private boolean removePath(ThreadContext context, RubyString path) { + IRubyObject file = this.filesInverse.delete(context, path, Block.NULL_BLOCK); + if (file.isNil()) return false; + return this.files.remove(file) != null; + } + + @JRubyMethod // synchronize { @files_inverse[path] } + public synchronized IRubyObject get(ThreadContext context, IRubyObject path) { + return this.filesInverse.op_aref(context, path); + } + + @JRubyMethod // synchronize { @files.size } + public synchronized IRubyObject size(ThreadContext context) { + return context.runtime.newFixnum(this.files.size()); + } + + @JRubyMethod(name = "empty?") // synchronize { @files.empty? } + public synchronized IRubyObject empty_p(ThreadContext context) { + return context.runtime.newBoolean(this.files.isEmpty()); + } + + @JRubyMethod + public synchronized IRubyObject each_file(ThreadContext context, Block block) { + for (IRubyObject watched_file : this.files.keySet()) { + block.yield(context, watched_file); + } + return context.nil; + } + + @JRubyMethod // synchronize { @files.values.to_a } + public IRubyObject paths(ThreadContext context) { + IRubyObject[] values; + synchronized (this) { + values = this.files.values().stream().toArray(IRubyObject[]::new); + } + return context.runtime.newArrayNoCopy(values); + } + + // NOTE: needs to return properly ordered files (can not use @files_inverse) + @JRubyMethod // synchronize { @files.key_set.to_a } + public IRubyObject files(ThreadContext context) { + IRubyObject[] keys; + synchronized (this) { + keys = this.files.keySet().stream().toArray(IRubyObject[]::new); + } + return context.runtime.newArrayNoCopy(keys); + } + + + @JRubyMethod + public IRubyObject update(ThreadContext context, IRubyObject file) { + // NOTE: modified_at might change on restat - to cope with that we need to potentially + // update the sorted collection, on such changes (when file_sort_by: last_modified) : + if (!"modified_at".equals(sortBy)) return context.nil; + + RubyString path = getFilePath(context, file); + synchronized (this) { + this.files.remove(file); // we need to "re-sort" changed file -> remove and add it back + modified_at(context, file, context.tru); // file.modified_at(update: true) + RubyString prev_path = this.files.put(file, path); + assert prev_path == null; + } + return context.tru; + } + + @JRubyMethod(required = 1, visibility = Visibility.PRIVATE) + @Override + public IRubyObject initialize_copy(IRubyObject original) { + final Ruby runtime = getRuntime(); + if (!(original instanceof WatchedFilesCollection)) { + throw runtime.newTypeError("Expecting 
an instance of class WatchedFilesCollection");
+        }
+
+        WatchedFilesCollection proto = (WatchedFilesCollection) original;
+
+        this.files = new TreeMap<>(proto.files.comparator());
+        synchronized (proto) {
+            this.files.putAll(proto.files);
+            this.filesInverse = (RubyHash) proto.filesInverse.dup(runtime.getCurrentContext());
+        }
+
+        return this;
+    }
+
+    @Override
+    public IRubyObject inspect() {
+        return getRuntime().newString("#<" + metaClass.getRealClass().getName() + ": size=" + this.files.size() + ">");
+    }
+
+    private static final CachingCallSite modified_at_site = new FunctionalCachingCallSite("modified_at");
+    private static final CachingCallSite path_site = new FunctionalCachingCallSite("path");
+
+    private static RubyString path(ThreadContext context, IRubyObject watched_file) {
+        return path_site.call(context, watched_file, watched_file).convertToString();
+    }
+
+    private static RubyFloat modified_at(ThreadContext context, IRubyObject watched_file) {
+        return modified_at_site.call(context, watched_file, watched_file).convertToFloat();
+    }
+
+    private static RubyFloat modified_at(ThreadContext context, IRubyObject watched_file, RubyBoolean update) {
+        return modified_at_site.call(context, watched_file, watched_file, update).convertToFloat();
+    }
+
+}
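
For reference, a minimal usage sketch of the new native FileWatch::WatchedFilesCollection, using only the JRuby-exposed methods defined above (add, get, each_file, remove_paths, size). The Settings and WatchedFile stand-ins are hypothetical simplifications of the real lib/filewatch classes, and the sketch assumes the Java extension has already been loaded through JrubyFileWatchService:

  # Hypothetical stand-ins; the real settings/watched_file objects carry more state.
  Settings    = Struct.new(:file_sort_by, :file_sort_direction)
  WatchedFile = Struct.new(:path, :mtime) do
    def modified_at(_update = false)  # the collection passes `true` when re-sorting on restat
      mtime
    end
  end

  collection = FileWatch::WatchedFilesCollection.new(Settings.new("last_modified", "asc"))
  older = WatchedFile.new("/var/log/a.log", 100.0)
  newer = WatchedFile.new("/var/log/b.log", 200.0)
  collection.add(newer)
  collection.add(older)

  collection.size                                 # => 2
  collection.get("/var/log/a.log").equal?(older)  # => true (inverse path lookup)
  collection.each_file { |wf| puts wf.path }      # a.log first, then b.log (ascending mtime)
  collection.remove_paths(["/var/log/b.log"])     # => 1 (count of entries removed)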
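
The FileWatch::Fnv class registered in JrubyFileWatchLibrary.java is, per its source comment, only a candidate for future fingerprinting use. A hypothetical sketch of how it would be called from Ruby, assuming the same loaded extension:

  fnv = FileWatch::Fnv.new("2023-01-01T00:00:00Z first line of a log file")
  whole  = fnv.fnv1a64       # FNV-1a 64-bit hash over the full string (Bignum)
  capped = fnv.fnv1a64(255)  # requested length is capped at the data size, so equals `whole` here
  fnv.open?                  # => true
  fnv.close
  fnv.closed?                # => true; further fnv1a32/fnv1a64 calls raise StandardError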