-
Notifications
You must be signed in to change notification settings - Fork 45
Expand file tree
/
Copy pathcommonutils.cpp
More file actions
230 lines (202 loc) · 6.48 KB
/
Copy pathcommonutils.cpp
File metadata and controls
230 lines (202 loc) · 6.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
/*******************************************************************************
* RetroShare full text indexing and search implementation based on Xapian *
* *
* Copyright (C) 2018-2021 Gioacchino Mazzurco <gio@eigenlab.org> *
* Copyright (C) 2019-2021 Asociación Civil Altermundi <info@altermundi.net> *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Affero General Public License version 3 as *
* published by the Free Software Foundation. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Affero General Public License for more details. *
* *
* You should have received a copy of the GNU Affero General Public License *
* along with this program. If not, see <https://www.gnu.org/licenses/>. *
* *
*******************************************************************************/
#include <algorithm>
#include <thread>
#include <utility>

#include "deep_search/commonutils.hpp"
#include "util/stacktrace.h"
#include "util/rsthreads.h"
#include "util/rsdebuglevel0.h"
namespace DeepSearch
{
/**
 * Extract the plain text contained in a RetroShare HTML message.
 *
 * The input is treated as HTML only if it starts with '<' and ends with '>',
 * and contains a `<body` tag; otherwise it is returned unchanged.
 * For HTML input everything up to and including the opening body tag is
 * dropped, every `<style ...>...</style>` block is removed together with its
 * CSS content, and finally every `<...>` tag is stripped.
 *
 * @param rsHtmlDoc message content, either plain text or HTML
 * @return the text with markup removed, or @p rsHtmlDoc itself when it does
 *         not look like an HTML document
 */
std::string simpleTextHtmlExtract(const std::string& rsHtmlDoc)
{
	if(rsHtmlDoc.empty()) return rsHtmlDoc;

	const bool isPlainMsg =
	        rsHtmlDoc.front() != '<' || rsHtmlDoc.back() != '>';
	if(isPlainMsg) return rsHtmlDoc;

	const auto bodyTagBegin = rsHtmlDoc.find("<body");
	if(bodyTagBegin == std::string::npos) return rsHtmlDoc;

	const auto bodyTagEnd = rsHtmlDoc.find('>', bodyTagBegin);
	if(bodyTagEnd == std::string::npos) return rsHtmlDoc;

	std::string body(rsHtmlDoc.substr(bodyTagEnd + 1));

	// Strip also CSS inside every <style></style> block, not just the first
	static const std::string styleClose("</style>");
	for( auto styleBegin = body.find("<style");
	     styleBegin != std::string::npos;
	     styleBegin = body.find("<style", styleBegin) )
	{
		const auto styleEnd = body.find(styleClose, styleBegin);
		// Unterminated style block: leave it to the generic tag stripping
		if(styleEnd == std::string::npos) break;
		body.erase(styleBegin, styleEnd + styleClose.size() - styleBegin);
	}

	/* Single forward pass copying only the text found between tags.
	 * The closing '>' is always searched after its opening '<', so a stray
	 * '>' in the text can neither truncate the output nor stall the scan.
	 * The cursor strictly advances at each step, hence no iteration cap is
	 * needed even for crafted input, and the cost is linear in the size. */
	std::string retVal;
	retVal.reserve(body.size());
	std::string::size_type cursor = 0;
	while(cursor < body.size())
	{
		const auto oPos = body.find('<', cursor);
		const auto cPos = oPos == std::string::npos ?
		            std::string::npos : body.find('>', oPos + 1);
		if(cPos == std::string::npos)
		{
			// No further complete tag: keep the remaining text verbatim
			retVal.append(body, cursor, std::string::npos);
			break;
		}

		retVal.append(body, cursor, oPos - cursor);
		cursor = cPos + 1;
	}

	return retVal;
}
}
// Xapian-specific code (only for channels/files indexing)
#if defined(RS_DEEP_CHANNEL_INDEX) || defined(RS_DEEP_FILES_INDEX)
#ifndef XAPIAN_AT_LEAST
/// Fallback definition, used only when XAPIAN_AT_LEAST is not already defined
/// at this point. Added in Xapian 1.4.2.
/// Evaluates to true when the Xapian version described by
/// XAPIAN_MAJOR_VERSION, XAPIAN_MINOR_VERSION and XAPIAN_REVISION is greater
/// than or equal to A.B.C, comparing major first, then minor, then revision.
#define XAPIAN_AT_LEAST(A,B,C) \
(XAPIAN_MAJOR_VERSION > (A) || \
(XAPIAN_MAJOR_VERSION == (A) && \
(XAPIAN_MINOR_VERSION > (B) || \
(XAPIAN_MINOR_VERSION == (B) && XAPIAN_REVISION >= (C)))))
#endif
namespace DeepSearch
{
std::unique_ptr<Xapian::Database> openReadOnlyDatabase(
const std::string& path, int flags )
{
try
{
#if XAPIAN_AT_LEAST(1,3,2)
std::unique_ptr<Xapian::Database> dbPtr(
new Xapian::Database(path, flags) );
#else
std::unique_ptr<Xapian::Database> dbPtr(new Xapian::Database(path));
if(flags)
{
RS_WARN( "Xapian DB flags: ", flags, " ignored due to old Xapian "
"library version: ", XAPIAN_VERSION, " < 1.3.2" );
}
#endif
return dbPtr;
}
catch(Xapian::DatabaseOpeningError& e)
{
RsWarn() << __PRETTY_FUNCTION__ << " " << e.get_msg()
<< ", probably nothing has been indexed yet." << std::endl;
}
catch(Xapian::DatabaseLockError&)
{
RsErr() << __PRETTY_FUNCTION__ << " Failed aquiring Xapian DB lock "
<< path << std::endl;
print_stacktrace();
}
catch(...)
{
RsErr() << __PRETTY_FUNCTION__ << " Xapian DB is apparently corrupted "
<< "deleting it might help without causing any harm: "
<< path << std::endl;
print_stacktrace();
}
return nullptr;
}
/**
 * Format a timestamp as an eight character "YYYYMMDD" date in UTC.
 *
 * @param time timestamp to convert, cast to time_t before conversion
 * @return the formatted date, or an empty string if the timestamp cannot be
 *         broken down by std::gmtime or does not fit in eight characters
 */
std::string timetToXapianDate(const rstime_t& time)
{
	const time_t tTime = static_cast<time_t>(time);

	// std::gmtime returns nullptr when the value cannot be represented,
	// passing that to std::strftime would be undefined behaviour
	const std::tm* utcTime = std::gmtime(&tTime);
	if(!utcTime) return std::string();

	char date[] = "YYYYMMDD";
	const std::size_t written =
	        std::strftime(date, sizeof(date), "%Y%m%d", utcTime);

	// On failure std::strftime returns 0 and leaves the buffer content
	// unspecified, so never hand that back to the caller
	if(written == 0) return std::string();

	return std::string(date, written);
}
/// Attempt a last flush with no accepted delay; whatever is still queued
/// after a failed attempt is reported as lost.
StubbornWriteOpQueue::~StubbornWriteOpQueue()
{
	std::error_condition flushOutcome = flush(0);
	if(!flushOutcome) return;

	RS_FATAL( "Flush failed on destruction ", mOpStore.size(),
	          " operations irreparably lost ", flushOutcome );
	print_stacktrace();
}
/**
 * Enqueue a write operation, then immediately attempt to flush the queue
 * using the default arguments of flush (declared outside this file).
 *
 * @param op operation to store; it is taken by value and moved into the
 *        queue so no extra copy of the callable is made
 */
void StubbornWriteOpQueue::push(write_op op)
{
	RS_DBG4("");

	{
		// Hold the queue mutex only while storing, flush() locks on its own
		std::lock_guard<std::mutex> lock(mQueueMutex);
		mOpStore.push(std::move(op));
	}

	// The outcome is not checked here: on failure the operation is still in
	// the queue and is picked up by a later flush
	flush();
}
/**
 * Apply all queued write operations to the Xapian database at mDbPath.
 *
 * If the database write lock cannot be acquired and a delay is accepted, a
 * retry is scheduled asynchronously until callTS + acceptDelay is reached.
 *
 * @param acceptDelay how long after @p callTS retries are acceptable, in the
 *        same unit as time(nullptr); 0 disables retrying
 * @param callTS timestamp of the original flush request, kept unchanged
 *        across retries so the deadline does not move
 * @return default constructed (no error) condition if the queue was empty,
 *         was flushed, or a retry has been scheduled;
 *         std::errc::timed_out if the lock is still unavailable once the
 *         accepted delay has expired;
 *         std::errc::resource_unavailable_try_again if the lock is
 *         unavailable and no delay is accepted;
 *         std::errc::io_error if opening the database failed otherwise
 */
std::error_condition StubbornWriteOpQueue::flush(
        rstime_t acceptDelay, rstime_t callTS )
{
	RS_DBG4("");

	{
		// Return without attempt to open the database if the queue is empty
		std::unique_lock<std::mutex> lock(mQueueMutex);
		if(mOpStore.empty()) return std::error_condition();
	}

	std::unique_ptr<Xapian::WritableDatabase> dbPtr;
	try
	{
		dbPtr = std::make_unique<Xapian::WritableDatabase>(
		            mDbPath, Xapian::DB_CREATE_OR_OPEN );
	}
	catch(const Xapian::DatabaseLockError&)
	{
		if(!acceptDelay) return std::errc::resource_unavailable_try_again;

		rstime_t tNow = time(nullptr);

		/* Time left before the deadline callTS + acceptDelay. It must be
		 * positive while still inside the accepted window, otherwise the
		 * call would time out immediately and retry only once too late. */
		rstime_t maxRemaining = (callTS + acceptDelay) - tNow;
		if(maxRemaining <= 0)
		{
			RS_ERR(std::errc::timed_out, acceptDelay, callTS, tNow);
			return std::errc::timed_out;
		}

		// Wait a fifth of the remaining time, but never less than 50ms
		std::chrono::milliseconds interval(
		            std::max(rstime_t(50), maxRemaining*1000/5) );
		RS_DBG3( "Cannot acquire database write lock, retrying in:",
		         interval.count(), "ms" );

		// NOTE(review): the retry captures `this`, so presumably the queue
		// must outlive any pending retry - confirm against the class usage
		RsThread::async([this, acceptDelay, callTS, interval]()
		{
			std::this_thread::sleep_for(interval);
			flush(acceptDelay, callTS);
		});
		return std::error_condition();
	}
	catch(...)
	{
		RS_ERR("Xapian DB ", mDbPath, " is apparently corrupted");
		print_stacktrace();
		return std::errc::io_error;
	}

	// Database is open for writing: drain the queue while holding its mutex
	std::unique_lock<std::mutex> lock(mQueueMutex);
	while(!mOpStore.empty())
	{
		auto op = std::move(mOpStore.front());
		mOpStore.pop();
		op(*dbPtr);
	}
	return std::error_condition();
}
}
#endif // RS_DEEP_CHANNEL_INDEX || RS_DEEP_FILES_INDEX