/*
 * Logserver
 * Copyright (C) 2017-2025 Joel Reardon
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef __LINE_INTELLIGENCE__H__
#define __LINE_INTELLIGENCE__H__

#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include "base64.h"
#include "constants.h"
#include "line.h"
#include "tokenizer.h"
#include "zcat.h"

using namespace std;

/* LineIntelligence is a static class that performs heuristics on lines to
 * decide whether things can be decoded, such as hex encoding, base64 encoding,
 * timestamp to human time, etc.
 */
class LineIntelligence {
public:
	/* Builds an indentation prefix: four blanks repeated `tabs` times.
	 * Returns the empty string when tabs is zero. */
	static string make_tabs(size_t tabs) {
		// the fill constructor produces exactly tabs * 4 space characters
		return string(tabs * 4, ' ');
	}

	/* Rough indenter for brace-structured code. The input is cut after
	 * every ';', '{' or '}' that is not flagged by
	 * Tokenizer::annotate_quote, and each piece is appended to tokens
	 * prefixed with the indentation of the current brace depth. Any text
	 * left after the final cut is appended without indentation. */
	/* TODO: this is just to have something now, but obviously a better
	 * parsing of the language tokens is needed to make it pretty */
	static void process_code(const string_view& data,
				 list<string>* tokens) {
		string quotes;
		Tokenizer::annotate_quote(data, &quotes);
		assert(data.size() == quotes.size());

		size_t depth = 0;
		size_t begin = 0;
		for (size_t pos = 0; pos < data.size(); ++pos) {
			// positions flagged '\'' are skipped (presumably
			// they lie inside quoted text)
			if (quotes[pos] == '\'') continue;

			const char c = data[pos];
			const bool cut_here = (c == ';' || c == '{' || c == '}');
			if (!cut_here) continue;

			// a closing brace is printed one level shallower
			if (c == '}' && depth > 0) --depth;
			tokens->push_back(
				make_tabs(depth) +
				string(data.substr(begin, pos + 1 - begin)));
			begin = pos + 1;
			// whatever follows an opening brace is one level deeper
			if (c == '{') ++depth;
		}
		if (begin != data.size()) {
			tokens->push_back(string(data.substr(begin)));
		}
	}

	/* Pretty-prints JSON-like data as one token per line, indented by
	 * nesting depth. Characters flagged by Tokenizer::annotate_quote are
	 * never treated as structure.
	 *
	 *  - '[' , '{' and ',' end the current token (the character stays on
	 *    that token); openers increase the depth for what follows.
	 *  - '}' and ']' first flush any pending text at the current depth,
	 *    then emit the closer one level shallower. A ',' immediately after
	 *    the closer is kept on the same token ("},").
	 *  - text left after the last structural character is appended
	 *    unindented and untrimmed.
	 *
	 * data: text to format. tokens: output list, appended to. */
	static void process_json(const string_view& data,
				 list<string>* tokens) {
		string quotes;
		Tokenizer::annotate_quote(data, &quotes);

		size_t tabs = 0;
		size_t last = 0;
		size_t cur = 0;
		assert(data.size() == quotes.size());
		while (cur < data.size()) {
			if (quotes[cur] != '\'') {
				const char c = data[cur];
				// open of array or object or end of current
				if (c == '[' || c == '{' || c == ',') {
					string token = make_tabs(tabs);
					token += Tokenizer::trim(
						data.substr(last, cur + 1 - last));
					tokens->push_back(token);
					last = cur + 1;
					if (c != ',') ++tabs;
				// close of array or object
				} else if (c == '}' || c == ']') {
					if (last < cur) {
						string token = make_tabs(tabs);
						token += Tokenizer::trim(
							data.substr(last, cur - last));
						tokens->push_back(token);
					}
					// remember where the closer sits BEFORE
					// swallowing a trailing comma, otherwise
					// the emitted token would be "," alone and
					// the brace/bracket would be lost
					const size_t closer = cur;
					if (cur + 1 < data.size() &&
					    data[cur + 1] == ',') {
						++cur;
					}
					if (tabs) --tabs;
					string suffix = make_tabs(tabs);
					suffix += Tokenizer::trim(
						data.substr(closer,
							    cur + 1 - closer));
					tokens->push_back(suffix);
					last = cur + 1;
				}
			}
			++cur;
		}
		if (last != cur) {
			tokens->push_back(string(data.substr(last)));
		}
	}

	/* Splits data on every occurrence of deliminator and appends the
	 * pieces to tokens, keeping empty pieces (so "a,,b" gives "a", "",
	 * "b"). The pieces are views into data and are only valid while the
	 * underlying buffer is. At least one token is always appended.
	 *
	 * An empty deliminator cannot split anything; data is appended as a
	 * single token in that case (searching for "" never advances and
	 * would loop forever). */
	static void split_with_empty(const string_view& data, const string& deliminator,
				     list<string_view>* tokens) {
		if (deliminator.empty()) {
			tokens->push_back(data);
			return;
		}
		size_t start = 0;
		while (true) {
			const size_t pos = data.find(deliminator, start);
			if (pos == string_view::npos) {
				// tail after the last deliminator (may be empty)
				tokens->push_back(data.substr(start));
				return;
			}
			tokens->push_back(data.substr(start, pos - start));
			start = pos + deliminator.length();
		}
	}

	/* Returns a copy of data in which every non-overlapping occurrence
	 * of find (scanning left to right) is replaced by replacement.
	 * If find does not occur, or find is empty, data is returned
	 * unchanged (an empty search string matches everywhere and previously
	 * made the underlying split loop forever). */
	static string replace(const string_view& data, const string& find,
			      const string& replacement) {
		if (find.empty()) return string(data);

		string ret;
		ret.reserve(data.size());
		size_t start = 0;
		while (true) {
			const size_t pos = data.find(find, start);
			if (pos == string_view::npos) break;
			// keep the text before the hit, then substitute
			ret.append(data.substr(start, pos - start));
			ret += replacement;
			start = pos + find.length();
		}
		// remainder after the last hit (the whole input if none)
		ret.append(data.substr(start));
		return ret;
	}

	/* Decides whether data looks percent-encoded (e.g. "%20"). Every '%'
	 * that has at least two characters after it counts as a hit when both
	 * are hex digits and as a miss otherwise. Returns true when there is
	 * at least one hit and no miss, or more than 10 hits with fewer than
	 * 4 misses. */
	static bool heuristic_hex_unescape(const string_view& data) {
		int hits = 0;
		int misses = 0;
		for (size_t i = 0; i + 2 < data.length(); ++i) {
			if (data[i] != '%') continue;
			// isxdigit is undefined for negative values, so bytes
			// >= 0x80 must go through unsigned char
			const bool is_escape =
				isxdigit(static_cast<unsigned char>(data[i + 1])) &&
				isxdigit(static_cast<unsigned char>(data[i + 2]));
			if (is_escape) {
				++hits;
			} else {
				++misses;
			}
		}
		if (!misses && hits) return true;
		if (hits > 10 && misses < 4) return true;
		return false;
	}

	/* Decodes percent-escapes: every "%XY" where X and Y are hex digits is
	 * replaced by the single byte with value 0xXY. Any other character,
	 * including a '%' not followed by two hex digits, is copied through
	 * unchanged. */
	static string hex_unescape(const string_view& data) {
		string ret;
		ret.reserve(data.length());
		for (size_t i = 0; i < data.length(); ++i) {
			// isxdigit is undefined for negative values, so bytes
			// >= 0x80 must go through unsigned char
			const bool is_escape =
				data[i] == '%' && i + 2 < data.length() &&
				isxdigit(static_cast<unsigned char>(data[i + 1])) &&
				isxdigit(static_cast<unsigned char>(data[i + 2]));
			if (!is_escape) {
				ret += data[i];
				continue;
			}
			// both characters are verified hex digits, so stoi
			// cannot throw here
			ret += static_cast<char>(
				stoi(string(data.substr(i + 1, 2)), nullptr, 16));
			i += 2;
		}
		return ret;
	}

	/* main function to split a string and return the result as new lines to
	 * insert into the log_lines object. If param c is set, then it splits
	 * using that character as the line break (i.e., if user specifies what
	 * char to use). Otherwise it applies heuristics: does this look like
	 * json, code, http query, etc., then split appropriately. Otherwise
	 * count occurances of possible delimiters, such as comma, \n, |, etc.,
	 * and split on that. Finally just split with a reasonable ragged right
	 * to avoid breaking up words. */
	// TODO: pass in as Line and preserve the node for GEB
	static list<unique_ptr<Line>> split(const string_view& line, char c) {
		// depending on the type of splitting we do, either we have
		// string_views or strings, but not both.
		list<string_view> to_add;
		list<string> string_to_add;
		// list of new Line objects to add
		list<unique_ptr<Line>> ret;
		if (line.empty()) {
			// TODO: no make
			ret.emplace_back(make_unique<Line>(line));
			return ret;
		}

		/* if we know what char to use, then do that */
		if (c != '\0') {
			split_with_empty(line, string(1, c), &to_add);
		/* else apply heuristics */
 		} else {
			map<string, size_t> counts;
			count_occurrences(line, &counts);

			if (heuristic_newlines(line, counts)) {
				split_with_empty(line, "\\n", &to_add);
			} else if (heuristic_httparg(line, counts)) {
				split_with_empty(line, "&", &to_add);
			} else if (heuristic_pipe(line, counts)) {
				split_with_empty(line, "|", &to_add);
			} else if (heuristic_json(line, counts)) {
				process_json(line, &string_to_add);
			} else if (heuristic_code(line, counts)) {
				process_code(line, &string_to_add);
			}

			// if no heuristic triggered then both lists will be
			// empty. If a heuristic did trigger, but triggering
			// didn't result in it actually adding new lines, then
			// one of the lists is empty and the other contains one
			// element. Either way we need to break it the old
			// fashion way.
			if (string_to_add.size() + to_add.size() <= 1) {
				ret.clear();
				to_add.clear();
				string_to_add.clear();

				// TODO: use actual screen width instead
				size_t i = G::LINE_WIDTH;
				string_view cur = line;
				while (i < cur.length()) {
					i = rewind(cur, i, 20);
					to_add.emplace_back(cur.substr(0, i + 1));
					cur = cur.substr(i + 1);
					i = G::LINE_WIDTH;
				}
				if (!cur.empty()) to_add.emplace_back(cur);
			}
		}
		for (const auto& x : to_add) {
			ret.push_back(make_unique<Line>(x));
		}
		for (const auto& x : string_to_add) {
			ret.push_back(make_unique<Line>(x));
		}
		return ret;
	}

	/* Picks a pleasant place to break line at or shortly before pos.
	 * Starting at pos and walking backwards, the first of the `amount`
	 * characters examined that is whitespace or punctuation from the
	 * break table wins and its index is returned. If none qualifies, pos
	 * itself is returned. Requires pos < line.length() and amount < pos. */
	static size_t rewind(const string_view& line,
			     size_t pos,
			     size_t amount) {
		// characters after which a break reads naturally
		static const string_view good_break =
			"& -_#,\t)({}<>:;!?.\"'\n";

		assert(pos < line.length());
		assert(amount < pos);
		size_t back = 0;
		while (back < amount) {
			const size_t at = pos - back;
			if (good_break.find(line[at]) != string_view::npos) {
				return at;
			}
			++back;
		}
		return pos;
	}

	/* Runs every decoder over line, each one working on the output of
	 * the previous successful one: unix timestamps to human time, then
	 * base16, then base64, then percent-unescaping. frustration is how
	 * often the user has retried; it is forwarded to the base16/base64
	 * decoders, which loosen their thresholds as it grows. Returns the
	 * final text, or nullopt when no decoder changed anything. */
	static optional<string> apply_heuristics(const string_view& line,
						 int frustration) {
		// text is the best decoding so far; decoded records whether
		// any stage has produced output at all
		string text(line);
		bool decoded = false;
		auto absorb = [&text, &decoded](optional<string> stage) {
			if (!stage) return;
			text = std::move(*stage);
			decoded = true;
		};

		absorb(LineIntelligence::useful_timestamp(line));
		absorb(LineIntelligence::useful_base16(text, frustration));
		absorb(LineIntelligence::useful_base64(text, frustration));
		if (heuristic_hex_unescape(text)) {
			text = LineIntelligence::hex_unescape(text);
			decoded = true;
		}

		if (!decoded) return nullopt;
		return text;
	}

	/* Finds standalone runs of exactly 10 or 13 digits (not adjacent to
	 * another digit) in line, converts each with ts_to_human, and returns
	 * line with every successfully converted run replaced by its human
	 * readable form. Returns nullopt when nothing was converted. */
	static optional<string> useful_timestamp(const string_view& line) {
		// compiled once; building a std::regex on every call is costly
		static const regex r_ts(
			"([^0-9]|^)([0-9]{13}|[0-9]{10})($|[^0-9])");
		const string copy(line);
		map<string, string> replaces;
		smatch sm;
		auto it = copy.cbegin();
		while (regex_search(it, copy.cend(), sm, r_ts)) {
			const string digits = sm[2].str();
			string result = ts_to_human(digits);
			if (!result.empty()) replaces[digits] = result;
			// every match is at least 10 characters long, so this
			// stays inside the string, and it leaves the trailing
			// separator of a 10-digit match available to the next
			// search
			it += sm.position() + 10;
		}
		return apply_matches(line, replaces);
	}

	/* Applies a dictionary of find -> replacement pairs to line, one
	 * after another in key order, each working on the result of the
	 * previous one. Returns the rewritten text, or nullopt when the
	 * dictionary is empty. Every key is expected to occur in line. */
	static optional<string> apply_matches(
			const string_view& line,
			const map<string, string>& replaces) {
		if (replaces.empty()) return nullopt;

		string ret(line);
		for (const auto& [needle, substitute] : replaces) {
			assert(line.find(needle) != string::npos);
			ret = replace(ret, needle, substitute);
		}
		// at least one key was present, so something must have changed
		assert(ret != line);
		return ret;
	}

	/* Looks for delimited runs of hex digits in in_line and replaces each
	 * one whose decoding passes the printable-text heuristic.
	 *
	 * A run must sit between delimiter characters (the line is padded with
	 * a blank on each side so runs at either end qualify). A run is only
	 * considered when it is longer than minlen[frustration]; its decoding
	 * is passed through ZCat::zcat when ZCat::magic flags it, and kept
	 * only if at least minpercent[frustration] percent of it is printable.
	 * frustration is clamped to 4 and must not be negative; higher values
	 * loosen both thresholds.
	 *
	 * Returns the rewritten line, or nullopt if nothing was decoded. */
	static optional<string> useful_base16(const string& in_line,
						     int frustration) {
		// compiled once; building a std::regex on every call is costly
		static const regex r_hex(
			"[ x=:;,.'\\\"\\t\\r]([A-Fa-f0-9]+)[ :;,.'\\\"\\t\\r]");
		static const size_t minlen[] = {7, 7, 5, 5, 3};
		static const size_t minpercent[] = {100, 95, 75, 50, 25};

		if (frustration > 4) frustration = 4;
		assert(frustration >= 0);

		string line = " " + in_line + " ";
		map<string, string> replaces;
		smatch sm;
		auto it = line.cbegin();
		while (it != line.cend() &&
		       regex_search(it, line.cend(), sm, r_hex)) {
			const string candidate = sm[1].str();
			if (candidate.length() > minlen[frustration]) {
				// dehex the finding, if it is gzipped ungzip
				// it. then check against the heuristic of
				// printable to keep it
				string result = dehexify(candidate);
				if (ZCat::magic(result))
					result = ZCat::zcat(result);
				if (percent_printable(result, false) <
				    minpercent[frustration])
					result = "";

				if (!result.empty()) replaces[candidate] = result;
			}
			// resume ON the trailing delimiter so it can open the
			// next run. stepping a fixed distance instead skipped
			// runs that directly follow a short match and
			// re-decoded the same match over and over.
			it = sm[0].second - 1;
		}

		if (replaces.empty()) {
			return nullopt;
		}

		for (const auto &x: replaces) {
			line = replace(line, x.first, x.second);
		}
		// strip the two padding blanks again
		assert(line.size() >= 2);
		return line.substr(1, line.length() - 2);
	}

	/* Hex-decodes data and returns the result only when at least
	 * `percent` percent of the decoded bytes are printable (per
	 * percent_printable); otherwise returns the empty string. */
	static string dehexify(const string &data, size_t percent) {
		string decoded = dehexify(data);
		const bool readable = percent_printable(decoded, false) >= percent;
		if (!readable) return "";
		return decoded;
	}

	/* Hex-decodes data: each pair of characters is parsed as a base-16
	 * number and becomes one output byte. Odd-length input is treated as
	 * if it had a leading '0'. */
	static string dehexify(const string_view& data) {
		// left-pad with a zero nibble so the digits pair up
		string digits;
		if (data.length() % 2) digits = "0";
		digits += data;

		string decoded;
		for (size_t at = 0; at < digits.length(); at += 2) {
			istringstream pair_in(digits.substr(at, 2));
			int value;
			pair_in >> hex >> value;
			decoded.push_back(static_cast<char>(value));
		}
		return decoded;
	}

	/* Looks for delimited base64-looking runs in in_line and replaces
	 * each one that b64_nonbinary manages to decode into mostly printable
	 * text.
	 *
	 * A run must sit between delimiter characters (the line is padded with
	 * a blank on each side so runs at either end qualify) and is only
	 * considered when longer than minlen[frustration]; the printable
	 * threshold handed to b64_nonbinary is minpercent[frustration].
	 * frustration is clamped to 4 and must not be negative; higher values
	 * loosen both thresholds.
	 *
	 * Returns the rewritten line, or nullopt if nothing was decoded. */
	static optional<string> useful_base64(const string& in_line,
					      int frustration) {
		// compiled once; building a std::regex on every call is costly
		static const regex r_b64(
			"[ =:;,.'\\\"\\t\\r]([-A-Za-z0-9+/_\\\\]+={0,2})[ :;,.'\\\"\\t\\r]");
		static const size_t minlen[] = {7, 7, 5, 5, 3};
		static const size_t minpercent[] = {100, 95, 75, 50, 25};

		if (frustration > 4) frustration = 4;
		assert(frustration >= 0);

		string line = " " + in_line + " ";
		map<string, string> replaces;
		smatch sm;
		auto it = line.cbegin();
		while (it != line.cend() &&
		       regex_search(it, line.cend(), sm, r_b64)) {
			const string candidate = sm[1].str();
			if (candidate.length() > minlen[frustration]) {
				string result = b64_nonbinary(
					candidate, minpercent[frustration]);
				if (!result.empty()) replaces[candidate] = result;
			}
			// resume ON the trailing delimiter so it can open the
			// next run. stepping a fixed distance instead skipped
			// runs that directly follow a short match and
			// re-decoded the same match over and over.
			it = sm[0].second - 1;
		}
		if (replaces.empty()) return nullopt;
		for (const auto &x: replaces) {
			line = replace(line, x.first, x.second);
		}
		// strip the two padding blanks again
		assert(line.length() >= 2);
		return line.substr(1, line.length() - 2);
	}

	/* Base64-decodes s (after removing literal backslash-n sequences)
	 * and returns the decoding if at least minpercent percent of it is
	 * printable. The decoding is passed through ZCat::zcat when
	 * ZCat::magic flags it. For inputs of 15 or more characters up to
	 * three further attempts are made, each with one more 'A' prepended
	 * (presumably to try other alignments of the base64 groups). Returns
	 * the empty string when no attempt passes. */
	static string b64_nonbinary(const string& s, size_t minpercent) {
		string val = replace(s, "\\n", "");
		// only long base64 segments get the retries
		const size_t attempts = (s.size() < 15) ? 1 : 4;
		for (size_t attempt = 0; attempt < attempts; ++attempt) {
			string decoded = ::Base64::decode(val);
			// TODO: instead insert as new lines
			if (ZCat::magic(decoded)) decoded = ZCat::zcat(decoded);
			if (percent_printable(decoded, true) >= minpercent) {
				return decoded;
			}
			val.insert(0, 1, 'A');
		}
		return "";
	}

	/* Returns the percentage (0-100, rounded down) of characters in s
	 * that are printable or whitespace; used to judge whether a decoding
	 * produced sensible text. An empty string counts as 100. When
	 * ignore_start is set, a leading run of non-printable bytes is
	 * excluded from the measurement; if the whole string is such a run
	 * the result is 0. */
	static size_t percent_printable(const string& s, bool ignore_start) {
		if (s.empty()) return 100;

		// the <cctype> classifiers are undefined for negative values,
		// so bytes >= 0x80 must go through unsigned char
		auto readable = [](char c) {
			const unsigned char u = static_cast<unsigned char>(c);
			return isspace(u) || isprint(u);
		};

		size_t pos = 0;
		if (ignore_start) {
			// bounded: without the size check an all-binary string
			// made this scan run past the end of s
			while (pos < s.size() && !readable(s[pos])) ++pos;
		}
		if (pos == s.size()) return 0;

		const size_t start_pos = pos;
		size_t count = 0;
		for (; pos < s.size(); ++pos) {
			if (readable(s[pos])) ++count;
		}
		return count * 100 / (s.length() - start_pos);
	}

	/* Counts how many characters of haystack are equal to needle. */
	static size_t count_occurrences(const string_view& haystack,
					char needle) {
		size_t hits = 0;
		for (const char c : haystack) {
			hits += (c == needle) ? 1 : 0;
		}
		return hits;
	}

	/* Counts non-overlapping occurrences of needle in haystack, scanning
	 * left to right and resuming after each hit (so "aa" occurs twice in
	 * "aaaa", not three times). An empty needle yields 0; searching for
	 * it would otherwise never advance and loop forever. */
	static size_t count_occurrences(const string_view& haystack,
					const string& needle) {
		if (needle.empty()) return 0;
		size_t matches = 0;
		for (size_t pos = haystack.find(needle);
		     pos != string_view::npos;
		     pos = haystack.find(needle, pos + needle.size())) {
			++matches;
		}
		return matches;
	}

	/* Tallies how often each key of *counts occurs in haystack
	 * (occurrences may overlap; every start position is tried). Only
	 * sequences that already have an entry in the map are counted.
	 * If the map is empty it is first filled with a default set of one-
	 * and two-character punctuation sequences, all at zero. Counts are
	 * added to whatever values the map already holds. A null counts
	 * pointer is a no-op. */
	static void count_occurrences(const string_view& haystack,
					map<string, size_t>* counts) {
		if (!counts) return;

		size_t max_len = 0;
		if (counts->empty()) {
			*counts = {
				{"\\n", 0},
				{"&", 0},
				{"|", 0},
				{"=", 0},
				{"!=", 0},
				{"==", 0},
				{"{", 0},
				{"}", 0},
				{";", 0},
				{",", 0},
				{":", 0},
				{"[", 0},
				{"]", 0},
				{"(", 0},
				{")", 0},
				{"\"", 0}
			};
			max_len = 2;
		} else {
			// longest key decides how far to look ahead
			for (const auto& x : *counts) {
				if (x.first.length() > max_len) {
					max_len = x.first.length();
				}
			}
		}

		for (size_t i = 0; i < haystack.length(); ++i) {
			// never ask for more characters than remain: substr
			// silently clamps, which used to count the tail of the
			// haystack again under a shorter key
			const size_t remaining = haystack.length() - i;
			for (size_t j = 1; j <= max_len && j <= remaining; ++j) {
				auto hit = counts->find(string(haystack.substr(i, j)));
				if (hit != counts->end()) ++hit->second;
			}
		}
	}

	/* simple similarity heuristic for two counts. */
	static bool similar(const map<string, size_t>& counts,
			    const string& one, const string& two) {
		size_t c1 = counts.at(one);
		size_t c2 = counts.at(two);
		// equal or no matches
		if (c1 == c2) return true;
		// none of one or other
		if (!c1 || !c2) return false;

		if (c1 > c2) {
			c1 = c2;
			c2 = counts.at(one);
		}
		// equal from above
		assert(c1 < c2);

		c1 += 0.15 * c2 + 5;
		return (c1 > c2);
	}

	/* Guesses whether the line is JSON from its punctuation counts: it
	 * must contain at least one '[' or '{', the bracket and brace pairs
	 * must each be roughly balanced (see similar), and double quotes must
	 * outnumber twice the '{', ':' or ',' count. The line itself is not
	 * inspected. */
	static bool heuristic_json([[maybe_unused]] const string_view& line,
				   const map<string, size_t>& counts) {
		// only process JSON arrays and objects
		const bool has_container =
			counts.at("[") != 0 || counts.at("{") != 0;
		if (!has_container) return false;

		const bool balanced = similar(counts, "[", "]") &&
				      similar(counts, "{", "}");
		if (!balanced) return false;

		const size_t quotes = counts.at("\"");
		return quotes > 2 * counts.at("{") ||
		       quotes > 2 * counts.at(":") ||
		       quotes > 2 * counts.at(",");
	}

	/* Guesses whether the line is brace-and-semicolon style code: true
	 * when it has more than 100 ';', more than 10 each of '{' and '}',
	 * and more than 20 '='. The line itself is not inspected. */
	static bool heuristic_code([[maybe_unused]] const string_view& line,
				   const map<string, size_t>& counts) {
		return counts.at(";") > 100 &&
		       counts.at("{") > 10 &&
		       counts.at("}") > 10 &&
		       counts.at("=") > 20;
	}

	/* Returns true when literal backslash-n sequences are frequent in
	 * the line: at least one per 100 characters. counts must hold an
	 * entry for "\\n". An empty line is never a match (and no longer
	 * divides by zero). */
	static bool heuristic_newlines(const string_view& line,
				       const map<string, size_t>& counts) {
		const size_t len = line.length();
		if (len == 0) return false;
		const size_t newlines = counts.at("\\n");
		// same as (newlines * 100 / len) != 0
		return newlines * 100 >= len;
	}

	/* Returns true when the line looks '|'-separated: at least one pipe
	 * per 150 characters. counts must hold an entry for "|". An empty
	 * line is never a match (and no longer divides by zero). */
	static bool heuristic_pipe(const string_view& line,
				   const map<string, size_t>& counts) {
		const size_t len = line.length();
		if (len == 0) return false;
		const size_t pipes = counts.at("|");
		// same as (pipes * 150 / len) != 0
		return pipes * 150 >= len;
	}

	/* Returns true when the line looks like an http query string. It
	 * needs at least three '&'; then either ampersands are dense (at
	 * least one per 50 characters) or the number of '=' is in step with
	 * them (more than half the ampersand count, and at most 5 above it).
	 * counts must hold entries for "&" and "=". An empty line is never a
	 * match (and no longer divides by zero). */
	static bool heuristic_httparg(const string_view& line,
				      const map<string, size_t>& counts) {
		const size_t len = line.length();
		if (len == 0) return false;
		const size_t ampers = counts.at("&");
		if (ampers < 3) return false;
		const size_t equals = counts.at("=");
		// same as (ampers * 50 / len) != 0
		if (ampers * 50 >= len) return true;
		if (equals <= ampers + 5 && 2 * equals > ampers) return true;
		return false;
	}

	/* Converts a unix timestamp given as a decimal string into human
	 * readable UTC time in asctime layout, e.g.
	 * "Wed Jun 30 21:49:08 1993".
	 *
	 * The unit is taken from the length of the string: 10 digits are
	 * seconds, 13 milliseconds, 16 microseconds. For the latter two the
	 * millisecond part is inserted, zero padded, after the seconds
	 * ("21:49:08.005 1993"); plain seconds carry no fraction, so none is
	 * printed.
	 *
	 * Returns the empty string when the text is not a number, has an
	 * unsupported length, lies more than ten years away from the current
	 * time, or cannot be converted to calendar time. */
	static string ts_to_human(const string& timestamp) {
		// ten years in seconds
		static const size_t DECADE = 315360000;

		size_t ours = 0;
		try {
			ours = stoull(timestamp);
		} catch (const logic_error& e) {
			return "";
		}

		size_t ours_s = 0;
		size_t millis = 0;
		bool has_millis = true;
		switch (timestamp.length()) {
		case 16:
			ours_s = ours / 1000000;
			millis = (ours / 1000) % 1000;
			break;
		case 13:
			ours_s = ours / 1000;
			millis = ours % 1000;
			break;
		case 10:
			ours_s = ours;
			has_millis = false;
			break;
		default:
			return "";
		}

		// only numbers reasonably close to "now" are plausible
		// timestamps
		const size_t now = static_cast<size_t>(time(nullptr));
		const bool within_decade =
			(now >= ours_s && now < ours_s + DECADE) ||
			(ours_s >= now && ours_s < now + DECADE);
		if (!within_decade) return "";

		const time_t ours_parm = static_cast<time_t>(ours_s);
		const struct tm* utc = gmtime(&ours_parm);
		if (!utc) return "";
		const char* text = asctime(utc);
		if (!text) return "";

		string result = text;
		// asctime terminates its output with a newline
		if (!result.empty()) result.pop_back();
		if (has_millis && result.length() > 6) {
			string fraction = to_string(millis);
			// millis < 1000, so this pads to exactly 3 digits
			fraction.insert(0, 3 - fraction.length(), '0');
			// goes right before the trailing " YYYY"
			result.insert(result.length() - 5, "." + fraction);
		}
		return result;
	}
};

#endif  // __LINE_INTELLIGENCE__H__
