|
|
@@ -32,20 +32,22 @@ public class OCRUtils {
|
|
32
|
32
|
, "委托代理人", "联系电话", "电子邮件", "被申请人", "居民身份证号码", "仲裁请求", "事实和理由"};
|
|
33
|
33
|
// Field names to recognise in loan contracts. pdfIdentifyText1 passes this array to
// req.setItemNames(...) when the document type contains "合同".
|
|
34
|
34
|
private static final String[] contractName = {
|
|
35
|
|
- "合同编号","甲方(贷款人)" ,"或委托代理人签字:","本合同的初始贷款年利率为","乙方确认有效的电子信箱地址为"};
|
|
|
35
|
+ "合同编号", "甲方(贷款人)", "或委托代理人签字:", "本合同的初始贷款年利率为", "乙方确认有效的电子信箱地址为"};
|
|
36
|
36
|
// Field names to recognise in mediation agreements. pdfIdentifyText1 passes this array to
// req.setItemNames(...) when the document type contains "调解协议".
|
|
37
|
|
- private static final String[] accordName = {"金融消费纠纷基本情况","经调解,双方自愿达成如下协议"};
|
|
|
37
|
+ private static final String[] accordName = {"金融消费纠纷基本情况", "经调解,双方自愿达成如下协议"};
|
|
38
|
38
|
// Field names to recognise in powers of attorney. pdfIdentifyText1 passes this array to
// req.setItemNames(...) when the document type contains "授权委托书".
|
|
39
|
39
|
private static final String[] powerAttorneyName = {"职务"};
|
|
|
40
|
+
|
|
40
|
41
|
/**
|
|
41
|
42
|
* pdf识别成文字
|
|
|
43
|
+ *
|
|
42
|
44
|
* @param imageBase64
|
|
43
|
45
|
* @param pageNumber
|
|
44
|
|
- * @param fatchRules 抓取规则
|
|
|
46
|
+ * @param fatchRules 抓取规则
|
|
45
|
47
|
* @return
|
|
46
|
48
|
*/
|
|
47
|
49
|
public static String pdfIdentifyText(String imageBase64, Integer pageNumber, List<FatchRule> fatchRules) {
|
|
48
|
|
- StringBuilder respStr=new StringBuilder();
|
|
|
50
|
+ StringBuilder respStr = new StringBuilder();
|
|
49
|
51
|
try {
|
|
50
|
52
|
// 实例化一个认证对象,入参需要传入腾讯云账户 SecretId 和 SecretKey,此处还需注意密钥对的保密
|
|
51
|
53
|
// 代码泄露可能会导致 SecretId 和 SecretKey 泄露,并威胁账号下所有资源的安全性。以下代码示例仅供参考,建议采用更安全的方式来使用密钥,请参见:https://cloud.tencent.com/document/product/1278/85305
|
|
|
@@ -76,17 +78,16 @@ public class OCRUtils {
|
|
76
|
78
|
for (TextDetection textDetection : textDetections) {
|
|
77
|
79
|
respStr.append(textDetection.getDetectedText());
|
|
78
|
80
|
}
|
|
79
|
|
- }catch (TencentCloudSDKException e){
|
|
|
81
|
+ } catch (TencentCloudSDKException e) {
|
|
80
|
82
|
throw new ServiceException("ocr识别失败");
|
|
81
|
83
|
}
|
|
82
|
|
- if(respStr.toString().endsWith(String.valueOf(pageNumber))){
|
|
|
84
|
+ if (respStr.toString().endsWith(String.valueOf(pageNumber))) {
|
|
83
|
85
|
int lastIndexOf = respStr.toString().lastIndexOf(String.valueOf(pageNumber));
|
|
84
|
|
- return respStr.toString().substring(0,lastIndexOf);
|
|
|
86
|
+ return respStr.toString().substring(0, lastIndexOf);
|
|
85
|
87
|
}
|
|
86
|
88
|
return respStr.toString();
|
|
87
|
89
|
|
|
88
|
90
|
|
|
89
|
|
-
|
|
90
|
91
|
//
|
|
91
|
92
|
// //解析数据
|
|
92
|
93
|
// String s = GeneralAccurateOCRResponse.toJsonString(resp);
|
|
|
@@ -121,12 +122,13 @@ public class OCRUtils {
|
|
121
|
122
|
|
|
122
|
123
|
/**
|
|
123
|
124
|
* pdf识别成文字
|
|
|
125
|
+ *
|
|
124
|
126
|
* @param imageBase64
|
|
125
|
127
|
* @param pageNumber
|
|
126
|
|
- * @param type pdf类型
|
|
|
128
|
+ * @param type pdf类型
|
|
127
|
129
|
* @return
|
|
128
|
130
|
*/
|
|
129
|
|
- public static String pdfIdentifyText1(String imageBase64, Integer pageNumber,String type) {
|
|
|
131
|
+ public static String pdfIdentifyText1(String imageBase64, Integer pageNumber, String type) {
|
|
130
|
132
|
try {
|
|
131
|
133
|
// 实例化一个认证对象,入参需要传入腾讯云账户 SecretId 和 SecretKey,此处还需注意密钥对的保密
|
|
132
|
134
|
// 代码泄露可能会导致 SecretId 和 SecretKey 泄露,并威胁账号下所有资源的安全性。以下代码示例仅供参考,建议采用更安全的方式来使用密钥,请参见:https://cloud.tencent.com/document/product/1278/85305
|
|
|
@@ -145,13 +147,13 @@ public class OCRUtils {
|
|
145
|
147
|
req.setImageBase64(imageBase64);
|
|
146
|
148
|
req.setIsPdf(true);
|
|
147
|
149
|
req.setPdfPageNumber(pageNumber.longValue());
|
|
148
|
|
- if(type.contains("申请书")){
|
|
|
150
|
+ if (type.contains("申请书")) {
|
|
149
|
151
|
req.setItemNames(applicantName);
|
|
150
|
|
- }else if(type.contains("调解协议")){
|
|
|
152
|
+ } else if (type.contains("调解协议")) {
|
|
151
|
153
|
req.setItemNames(accordName);
|
|
152
|
|
- }else if(type.contains("合同")){
|
|
|
154
|
+ } else if (type.contains("合同")) {
|
|
153
|
155
|
req.setItemNames(contractName);
|
|
154
|
|
- }else if(type.contains("授权委托书")){
|
|
|
156
|
+ } else if (type.contains("授权委托书")) {
|
|
155
|
157
|
req.setItemNames(powerAttorneyName);
|
|
156
|
158
|
}
|
|
157
|
159
|
|
|
|
@@ -190,7 +192,8 @@ public class OCRUtils {
|
|
190
|
192
|
}
|
|
191
|
193
|
return null;
|
|
192
|
194
|
}
|
|
193
|
|
- public static String pdfConvertBase64(String pathUrl){
|
|
|
195
|
+
|
|
|
196
|
+ public static String pdfConvertBase64(String pathUrl) {
|
|
194
|
197
|
try {
|
|
195
|
198
|
File file = new File(pathUrl);
|
|
196
|
199
|
FileInputStream fileInputStream = new FileInputStream(file);
|
|
|
@@ -207,33 +210,90 @@ public class OCRUtils {
|
|
207
|
210
|
|
|
208
|
211
|
/**
|
|
209
|
212
|
* 根据抓取规则获取内容
|
|
210
|
|
- * @param ocrText ocr识别的text
|
|
|
213
|
+ *
|
|
|
214
|
+ * @param ocrText ocr识别的text
|
|
211
|
215
|
* @param fatchRules 抓取规则
|
|
212
|
216
|
* @return
|
|
213
|
217
|
*/
|
|
214
|
218
|
public static void fatchRuleGetContent(String ocrText, List<FatchRule> fatchRules, Map<String, String> fatchMap) {
|
|
215
|
|
-
|
|
|
219
|
+ String text = ocrText;
|
|
216
|
220
|
if (CollectionUtil.isNotEmpty(fatchRules)) {
|
|
217
|
221
|
|
|
218
|
222
|
for (FatchRule fatchRule : fatchRules) {
|
|
219
|
|
- if (StrUtil.isEmpty(fatchRule.getStartContent())) {
|
|
220
|
|
- continue;
|
|
221
|
|
- }
|
|
|
223
|
+ // 从后往前抓取
|
|
|
224
|
+ if (fatchRule.getFatchOrder() != null && fatchRule.getFatchOrder() == 1) {
|
|
|
225
|
+// String reverseText = StrUtil.reverse(ocrText);
|
|
|
226
|
+// if (StrUtil.isEmpty(fatchRule.getStartContent()) && StrUtil.isEmpty(fatchRule.getEndContent())) {
|
|
|
227
|
+// fatchMap.put(fatchRule.getColumnName(), trimStr(text));
|
|
|
228
|
+// } else if(StrUtil.isEmpty(fatchRule.getStartContent())&&StrUtil.isNotEmpty(fatchRule.getEndContent())){
|
|
|
229
|
+// // 开始字段为空,结束字段不为空
|
|
|
230
|
+// String endReverseText = StrUtil.reverse(fatchRule.getEndContent());
|
|
|
231
|
+// int endContIndex = StrUtil.ordinalIndexOf(reverseText, endReverseText, fatchRule.getEndContentRepeatOrder());
|
|
|
232
|
+// if (endContIndex != -1) {
|
|
|
233
|
+// String substring = text.substring(0, endContIndex);
|
|
|
234
|
+// fatchMap.put(fatchRule.getColumnName(), trimStr(StrUtil.reverse(substring)));
|
|
|
235
|
+// }
|
|
|
236
|
+//
|
|
|
237
|
+//
|
|
|
238
|
+// }else if(StrUtil.isNotEmpty(fatchRule.getStartContent())&&StrUtil.isEmpty(fatchRule.getEndContent())){
|
|
|
239
|
+// // 开始字段不为空,结束字段为空
|
|
|
240
|
+// // 开始字段为空,结束字段不为空
|
|
|
241
|
+// String startReverseText = StrUtil.reverse(fatchRule.getStartContent());
|
|
|
242
|
+// int startContIndex = StrUtil.ordinalIndexOf(reverseText, startReverseText, fatchRule.getStartContentRepeatOrder());
|
|
|
243
|
+// if (startContIndex != -1) {
|
|
|
244
|
+// String substring = text.substring(0, startContIndex);
|
|
|
245
|
+// fatchMap.put(fatchRule.getColumnName(), trimStr(StrUtil.reverse(substring)));
|
|
|
246
|
+// }
|
|
|
247
|
+//
|
|
|
248
|
+// }else if(StrUtil.isNotEmpty(fatchRule.getStartContent())&&StrUtil.isNotEmpty(fatchRule.getEndContent())){
|
|
|
249
|
+// String startReverseText = StrUtil.reverse(fatchRule.getStartContent());
|
|
|
250
|
+// String endReverseText = StrUtil.reverse(fatchRule.getEndContent());
|
|
|
251
|
+// int startContIndex = StrUtil.ordinalIndexOf(reverseText, startReverseText, fatchRule.getStartContentRepeatOrder());
|
|
|
252
|
+// if (startContIndex != -1) {
|
|
|
253
|
+// // 结束字段不为空
|
|
|
254
|
+// int endContIndex = StrUtil.ordinalIndexOf(reverseText, endReverseText, fatchRule.getEndContentRepeatOrder());
|
|
|
255
|
+// if (endContIndex != -1 && (startContIndex + fatchRule.getStartContent().length()) <= text.length()) {
|
|
|
256
|
+// String substring = text.substring(startContIndex + fatchRule.getStartContent().length(), endContIndex);
|
|
|
257
|
+// // 去除\n
|
|
|
258
|
+// fatchMap.put(fatchRule.getColumnName(),trimStr(StrUtil.reverse(substring)));
|
|
|
259
|
+// }
|
|
|
260
|
+// }
|
|
|
261
|
+// }
|
|
|
262
|
+ } else {
|
|
|
263
|
+ // 开始为空结束为空
|
|
|
264
|
+ if (StrUtil.isEmpty(fatchRule.getStartContent()) && StrUtil.isEmpty(fatchRule.getEndContent())) {
|
|
|
265
|
+ fatchMap.put(fatchRule.getColumnName(), trimStr(text));
|
|
|
266
|
+ } else if (StrUtil.isNotEmpty(fatchRule.getStartContent())) {
|
|
|
267
|
+ int startContIndex = StrUtil.ordinalIndexOf(text, fatchRule.getStartContent(), fatchRule.getStartContentRepeatOrder());
|
|
|
268
|
+ if (startContIndex != -1) {
|
|
|
269
|
+ // 开始不为空结束为空
|
|
|
270
|
+ if (StrUtil.isEmpty(fatchRule.getEndContent())) {
|
|
|
271
|
+ if ((startContIndex + fatchRule.getStartContent().length()) <= text.length()) {
|
|
|
272
|
+ String substring = text.substring(startContIndex + fatchRule.getStartContent().length());
|
|
|
273
|
+ // 去除\n
|
|
|
274
|
+ fatchMap.put(fatchRule.getColumnName(), trimStr(substring));
|
|
|
275
|
+ }
|
|
222
|
276
|
|
|
223
|
|
- if (StrUtil.isNotEmpty(fatchRule.getStartContent()) && StrUtil.isNotEmpty(fatchRule.getEndContent())) {
|
|
224
|
|
- String s = StringUtils.substringBetween(ocrText, fatchRule.getStartContent(), fatchRule.getEndContent());
|
|
225
|
|
- if(StrUtil.isNotEmpty(s)){
|
|
226
|
|
- fatchMap.put(fatchRule.getColumnName(), StrUtil.trim(s));
|
|
227
|
|
- }else {
|
|
228
|
|
- fatchMap.put(fatchRule.getColumnName(),"");
|
|
229
|
|
- }
|
|
|
277
|
+ } else {
|
|
|
278
|
+ // 开始不为空结束不为空
|
|
|
279
|
+ int endContIndex = StrUtil.ordinalIndexOf(text, fatchRule.getEndContent(), fatchRule.getEndContentRepeatOrder());
|
|
|
280
|
+ if (endContIndex != -1 && (startContIndex + fatchRule.getStartContent().length()) <= text.length()) {
|
|
|
281
|
+ String substring = text.substring(startContIndex + fatchRule.getStartContent().length(), endContIndex);
|
|
|
282
|
+ // 去除\n
|
|
|
283
|
+ fatchMap.put(fatchRule.getColumnName(), trimStr(substring));
|
|
|
284
|
+ }
|
|
230
|
285
|
|
|
231
|
|
- }else if(StrUtil.isNotEmpty(fatchRule.getStartContent()) && StrUtil.isEmpty(fatchRule.getEndContent())){
|
|
232
|
|
- String s = StringUtils.substringAfter(ocrText,fatchRule.getStartContent());
|
|
233
|
|
- if(StrUtil.isNotEmpty(s)){
|
|
234
|
|
- fatchMap.put(fatchRule.getColumnName(), StrUtil.trim(s));
|
|
235
|
|
- }else {
|
|
236
|
|
- fatchMap.put(fatchRule.getColumnName(),"");
|
|
|
286
|
+ }
|
|
|
287
|
+//
|
|
|
288
|
+ }
|
|
|
289
|
+
|
|
|
290
|
+ } else if (StrUtil.isEmpty(fatchRule.getStartContent()) && StrUtil.isNotEmpty(fatchRule.getEndContent())) {
|
|
|
291
|
+ // 开始为空结束不为空
|
|
|
292
|
+ int endContIndex = StrUtil.ordinalIndexOf(text, fatchRule.getEndContent(), fatchRule.getEndContentRepeatOrder());
|
|
|
293
|
+ if (endContIndex != -1) {
|
|
|
294
|
+ String substring = text.substring(0, endContIndex);
|
|
|
295
|
+ fatchMap.put(fatchRule.getColumnName(), trimStr(substring));
|
|
|
296
|
+ }
|
|
237
|
297
|
}
|
|
238
|
298
|
}
|
|
239
|
299
|
}
|
|
|
@@ -241,4 +301,20 @@ public class OCRUtils {
|
|
241
|
301
|
|
|
242
|
302
|
|
|
243
|
303
|
}
|
|
|
304
|
+
|
|
|
305
|
+ /**
|
|
|
306
|
+ * 去除末尾空格
|
|
|
307
|
+ *
|
|
|
308
|
+ * @param substring
|
|
|
309
|
+ * @return
|
|
|
310
|
+ */
|
|
|
311
|
+ private static String trimStr(String substring) {
|
|
|
312
|
+ if (StrUtil.isNotEmpty(substring)) {
|
|
|
313
|
+ return StrUtil.trim(substring);
|
|
|
314
|
+ } else {
|
|
|
315
|
+ return "";
|
|
|
316
|
+ }
|
|
|
317
|
+
|
|
|
318
|
+ }
|
|
|
319
|
+
|
|
244
|
320
|
}
|