diff --git a/.env-ci b/.env-ci index 1cc42b5e3..91474760f 100644 --- a/.env-ci +++ b/.env-ci @@ -5,4 +5,4 @@ SQL_DATABASE=ianalyzer SQL_PASSWORD=ianalyzer CELERY_BROKER=redis://redis ES_HOST=elasticsearch -DATA_DIR=/ci-data \ No newline at end of file +DATA_DIR=/ci-data diff --git a/backend/visualization/field_stats.py b/backend/visualization/field_stats.py index 1fee67991..e43108b2c 100644 --- a/backend/visualization/field_stats.py +++ b/backend/visualization/field_stats.py @@ -50,3 +50,35 @@ def report_coverage(corpus_name): } +def cardinality_results(search_result): + return search_result['aggregations']['unique_category_count']['value'] + +def report_cardinality(corpus_name): + ''' + Returns a dict with the number of unique values for each field in the corpus + ''' + es_client = elasticsearch(corpus_name) + corpus_conf = CorpusConfiguration.objects.get(corpus__name=corpus_name) + cardinality_dict = {} + + query = { + "size": 0, + "aggs": { + "unique_category_count": { + "cardinality": { + "field": "PLACEHOLDER", + "precision_threshold": 10000 + } + } + } + } + + for field in corpus_conf.fields.all(): + if field.display_type != 'keyword': + cardinality_dict[field.name] = 0 + else: + query_for_field = query + query_for_field['aggs']['unique_category_count']['cardinality']['field'] = field.name + cardinality_dict[field.name] = cardinality_results(es_client.search(index=corpus_conf.es_index, body=query_for_field)) + + return cardinality_dict diff --git a/backend/visualization/tests/test_field_stats.py b/backend/visualization/tests/test_field_stats.py index e18e456b5..5da9089f7 100644 --- a/backend/visualization/tests/test_field_stats.py +++ b/backend/visualization/tests/test_field_stats.py @@ -1,4 +1,4 @@ -from visualization.field_stats import count_field, count_total, report_coverage +from visualization.field_stats import count_field, count_total, report_coverage, report_cardinality def test_count(small_mock_corpus, es_client, index_small_mock_corpus, 
small_mock_corpus_specs): @@ -20,3 +20,12 @@ def test_report(small_mock_corpus, es_client, index_small_mock_corpus, small_moc 'content': 1.0, 'genre': 1.0, } + +def test_cardinality(small_mock_corpus, es_client, index_small_mock_corpus, small_mock_corpus_specs): + report = report_cardinality(small_mock_corpus) + assert report == { + 'date': 0, + 'title': 0, + 'content': 0, + 'genre': 3 + } diff --git a/backend/visualization/urls.py b/backend/visualization/urls.py index 62a0ada49..6a16b520f 100644 --- a/backend/visualization/urls.py +++ b/backend/visualization/urls.py @@ -8,5 +8,6 @@ path('ngram', NgramView.as_view()), path('date_term_frequency', DateTermFrequencyView.as_view()), path('aggregate_term_frequency', AggregateTermFrequencyView.as_view()), - path('coverage/', FieldCoverageView.as_view()) + path('coverage/', FieldCoverageView.as_view()), + path('cardinality/', FieldCardinalityView.as_view()) ] diff --git a/backend/visualization/views.py b/backend/visualization/views.py index 02191bc66..4854aade5 100644 --- a/backend/visualization/views.py +++ b/backend/visualization/views.py @@ -6,7 +6,7 @@ from django.conf import settings from addcorpus.permissions import CanSearchCorpus from tag.permissions import CanSearchTags -from visualization.field_stats import report_coverage +from visualization.field_stats import report_coverage, report_cardinality from addcorpus.permissions import corpus_name_from_request from api.utils import check_json_keys @@ -181,3 +181,15 @@ def get(self, request, *args, **kwargs): corpus = corpus_name_from_request(request) report = report_coverage(corpus) return Response(report) + +class FieldCardinalityView(APIView): + ''' + Get the number of different values for each field in a corpus + ''' + + permission_classes = [CanSearchCorpus] + + def get(self, request, *args, **kwargs): + corpus = corpus_name_from_request(request) + report = report_cardinality(corpus) + return Response(report) diff --git a/docker-compose.yaml b/docker-compose.yaml
index 4f0082793..d80b8438d 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -60,12 +60,13 @@ services: - cluster.name=ianalizer-es-data-cluster - bootstrap.memory_lock=true - xpack.security.enabled=false + - xpack.security.http.ssl.enabled=false - logger.org.elasticsearch.discovery=ERROR - logger.org.elasticsearch.transport=ERROR - logger.org.elasticsearch.http=ERROR - logger.org.elasticsearch.cluster=ERROR - "ES_JAVA_OPTS=-Xms2g -Xmx2g" - - ELASTIC_PASSWORD=$ELASTIC_ROOT_PASSWORD + ulimits: memlock: soft: -1 diff --git a/frontend/src/app/corpus/corpus-info/corpus-info.component.html b/frontend/src/app/corpus/corpus-info/corpus-info.component.html index a2434e1a6..0b26a6e56 100644 --- a/frontend/src/app/corpus/corpus-info/corpus-info.component.html +++ b/frontend/src/app/corpus/corpus-info/corpus-info.component.html @@ -32,7 +32,8 @@ id="fields" title="Fields">
+ [coverage]="fieldCoverage ? fieldCoverage[field.name] : undefined" + [cardinality]="fieldCardinality ? fieldCardinality[field.name] : undefined">
diff --git a/frontend/src/app/corpus/corpus-info/corpus-info.component.ts b/frontend/src/app/corpus/corpus-info/corpus-info.component.ts index aafda98f0..d9d7bc161 100644 --- a/frontend/src/app/corpus/corpus-info/corpus-info.component.ts +++ b/frontend/src/app/corpus/corpus-info/corpus-info.component.ts @@ -1,6 +1,6 @@ import { Component, OnInit } from '@angular/core'; import { ApiService, CorpusService } from '@services'; -import { Corpus, CorpusDocumentationPage, FieldCoverage } from '@models'; +import { Corpus, CorpusDocumentationPage, FieldCardinality, FieldCoverage } from '@models'; import { marked } from 'marked'; import { Observable } from 'rxjs'; import { Title } from '@angular/platform-browser'; @@ -18,6 +18,7 @@ export class CorpusInfoComponent implements OnInit { corpus: Corpus; fieldCoverage: FieldCoverage; + fieldCardinality: FieldCardinality; documentation$: Observable; @@ -40,6 +41,9 @@ export class CorpusInfoComponent implements OnInit { this.apiService.fieldCoverage(corpus.name).then( result => this.fieldCoverage = result ); + this.apiService.fieldCardinality(corpus.name).then( + result => this.fieldCardinality = result + ); this.title.setTitle(pageTitle(`About ${corpus.title}`)); } diff --git a/frontend/src/app/corpus/corpus-info/field-info/field-info.component.html b/frontend/src/app/corpus/corpus-info/field-info/field-info.component.html index c50735816..d70b9af9b 100644 --- a/frontend/src/app/corpus/corpus-info/field-info/field-info.component.html +++ b/frontend/src/app/corpus/corpus-info/field-info/field-info.component.html @@ -36,9 +36,21 @@

{{field.displayName}}

{{coveragePercentage}}% of the documents in this corpus have a value for this field

-

Loading coverage data...

+ + @switch(cardinality) { + @case(undefined) { +

Loading cardinality data...

+ } + @case(0) { +

Unique values are only calculated for keyword fields, not free text fields or dates.

+ } + @default { +

There are {{cardinality}} unique values for this field.

+ } + } + diff --git a/frontend/src/app/corpus/corpus-info/field-info/field-info.component.ts b/frontend/src/app/corpus/corpus-info/field-info/field-info.component.ts index 179d37ecb..db9bcd240 100644 --- a/frontend/src/app/corpus/corpus-info/field-info/field-info.component.ts +++ b/frontend/src/app/corpus/corpus-info/field-info/field-info.component.ts @@ -11,6 +11,7 @@ import * as _ from 'lodash'; export class FieldInfoComponent implements OnInit { @Input() field: CorpusField; @Input() coverage: number; + @Input() cardinality: number; mappingNames = { text: 'text', diff --git a/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.html b/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.html index deb3764dc..b5ad543cf 100644 --- a/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.html +++ b/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.html @@ -1,8 +1,18 @@
- + +
{{item.label}}
{{item.doc_count}}
diff --git a/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.scss b/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.scss index 5a368d9d5..4226aad72 100644 --- a/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.scss +++ b/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.scss @@ -14,7 +14,10 @@ .select-label { margin-top: 2px; white-space: pre-line; - max-width: 180px; vertical-align: top; flex-grow: 2; } + +::ng-deep .p-multiselect-option { + height: auto !important; +} \ No newline at end of file diff --git a/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.ts b/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.ts index 0563c21e2..d98807908 100644 --- a/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.ts +++ b/frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.ts @@ -6,6 +6,7 @@ import { TermsAggregator, TermsResult } from '@models/aggregation'; import { SearchService } from '@services'; import { MultipleChoiceFilter, MultipleChoiceFilterOptions } from '@models'; import { BaseFilterComponent } from '../base-filter.component'; +import { MultiSelectLazyLoadEvent } from 'primeng/multiselect'; @Component({ selector: 'ia-multiple-choice-filter', @@ -15,6 +16,7 @@ import { BaseFilterComponent } from '../base-filter.component'; }) export class MultipleChoiceFilterComponent extends BaseFilterComponent { options: { label: string; value: string; doc_count: number }[] = []; + allOptionsCalled: boolean = false; constructor(private searchService: SearchService) { super(); @@ -25,12 +27,26 @@ export class MultipleChoiceFilterComponent extends BaseFilterComponent 10) { + this.getOptions(true); + this.allOptionsCalled = true; + } } - private async getOptions(): Promise { + private async getOptions(all: boolean = false): Promise { + 
console.log('fire'); if (this.filter && this.queryModel) { - const optionCount = (this.filter.corpusField.filterOptions as MultipleChoiceFilterOptions).option_count; + // optionCount is set to the maximum when the filter panel is shown, but not when other filters change + const optionCount = all ? 10000 : (this.filter.corpusField.filterOptions as MultipleChoiceFilterOptions).option_count; const aggregator = new TermsAggregator(this.filter.corpusField, optionCount); const queryModel = this.queryModel.clone(); queryModel.filterForField(this.filter.corpusField).deactivate(); diff --git a/frontend/src/app/models/visualization.ts b/frontend/src/app/models/visualization.ts index f330d4b25..53ff7a4ae 100644 --- a/frontend/src/app/models/visualization.ts +++ b/frontend/src/app/models/visualization.ts @@ -113,3 +113,8 @@ export interface ChartParameters { export interface FieldCoverage { [field: string]: number; }; + +/** number of unique values for that field */ +export interface FieldCardinality { + [field: string]: number; +} diff --git a/frontend/src/app/services/api.service.ts b/frontend/src/app/services/api.service.ts index d3fc870de..23a205660 100644 --- a/frontend/src/app/services/api.service.ts +++ b/frontend/src/app/services/api.service.ts @@ -2,7 +2,7 @@ import { Injectable } from '@angular/core'; import { HttpClient, HttpParams } from '@angular/common/http'; -import { interval, Observable } from 'rxjs'; +import { firstValueFrom, interval, Observable } from 'rxjs'; import { filter, switchMap, take, takeUntil } from 'rxjs/operators'; import { AggregateTermFrequencyParameters, @@ -13,6 +13,7 @@ import { Download, DownloadOptions, FieldCoverage, + FieldCardinality, FoundDocument, GeoDocument, GeoLocation, @@ -185,6 +186,11 @@ export class ApiService { return this.http.get(url).toPromise(); } + fieldCardinality(corpusName: string): Promise { + const url = this.apiRoute(this.visApiURL, `cardinality/${corpusName}`); + return firstValueFrom(this.http.get(url)); + } + // 
Download public requestFullData( data: