Skip to content
Merged
2 changes: 1 addition & 1 deletion .env-ci
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ SQL_DATABASE=ianalyzer
SQL_PASSWORD=ianalyzer
CELERY_BROKER=redis://redis
ES_HOST=elasticsearch
DATA_DIR=/ci-data
DATA_DIR=/ci-data
32 changes: 32 additions & 0 deletions backend/visualization/field_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,35 @@ def report_coverage(corpus_name):
}


def cardinality_results(search_result):
    '''
    Extract the unique-value count from an Elasticsearch search response.

    `search_result` must contain an aggregation named `unique_category_count`
    (the name used by the query built in `report_cardinality`); the `value` of
    that aggregation is returned as-is.

    Raises a KeyError if the response does not contain that aggregation.
    '''
    return search_result['aggregations']['unique_category_count']['value']

def report_cardinality(corpus_name):
    '''
    Returns a dict with the number of unique values for each field in the corpus

    Only fields whose `display_type` is `'keyword'` are queried, with one
    Elasticsearch cardinality aggregation per field. Every other field is
    reported as 0, so 0 doubles as the "not calculated" marker.
    '''
    es_client = elasticsearch(corpus_name)
    corpus_conf = CorpusConfiguration.objects.get(corpus__name=corpus_name)
    cardinality_dict = {}

    for field in corpus_conf.fields.all():
        if field.display_type != 'keyword':
            cardinality_dict[field.name] = 0
            continue

        # Build a fresh request body for every field rather than mutating one
        # shared template, so no state can leak from one field's query to the next.
        query = {
            'size': 0,  # only the aggregation is needed, no hits
            'aggs': {
                'unique_category_count': {
                    'cardinality': {
                        'field': field.name,
                        # NOTE(review): per the Elasticsearch docs, cardinality
                        # counts are approximate above this threshold - confirm
                        # that is acceptable for fields with many values.
                        'precision_threshold': 10000,
                    }
                }
            }
        }
        result = es_client.search(index=corpus_conf.es_index, body=query)
        cardinality_dict[field.name] = cardinality_results(result)

    return cardinality_dict
11 changes: 10 additions & 1 deletion backend/visualization/tests/test_field_stats.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from visualization.field_stats import count_field, count_total, report_coverage
from visualization.field_stats import count_field, count_total, report_coverage, report_cardinality


def test_count(small_mock_corpus, es_client, index_small_mock_corpus, small_mock_corpus_specs):
Expand All @@ -20,3 +20,12 @@ def test_report(small_mock_corpus, es_client, index_small_mock_corpus, small_moc
'content': 1.0,
'genre': 1.0,
}

def test_cardinality(small_mock_corpus, es_client, index_small_mock_corpus, small_mock_corpus_specs):
    '''
    The cardinality report for the small mock corpus should list every field,
    with `genre` as the only field that has a non-zero unique-value count.
    '''
    expected = {
        'date': 0,
        'title': 0,
        'content': 0,
        'genre': 3,
    }
    assert report_cardinality(small_mock_corpus) == expected
3 changes: 2 additions & 1 deletion backend/visualization/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@
path('ngram', NgramView.as_view()),
path('date_term_frequency', DateTermFrequencyView.as_view()),
path('aggregate_term_frequency', AggregateTermFrequencyView.as_view()),
path('coverage/<str:corpus>', FieldCoverageView.as_view())
path('coverage/<str:corpus>', FieldCoverageView.as_view()),
path('cardinality/<str:corpus>', FieldCardinalityView.as_view())
]
14 changes: 13 additions & 1 deletion backend/visualization/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from django.conf import settings
from addcorpus.permissions import CanSearchCorpus
from tag.permissions import CanSearchTags
from visualization.field_stats import report_coverage
from visualization.field_stats import report_coverage, report_cardinality
from addcorpus.permissions import corpus_name_from_request
from api.utils import check_json_keys

Expand Down Expand Up @@ -181,3 +181,15 @@ def get(self, request, *args, **kwargs):
corpus = corpus_name_from_request(request)
report = report_coverage(corpus)
return Response(report)

class FieldCardinalityView(APIView):
    '''
    Get the number of different values for each field in a corpus
    '''

    # Same permission check as is applied to searching the corpus
    permission_classes = [CanSearchCorpus]

    def get(self, request, *args, **kwargs):
        '''
        Respond with the result of `report_cardinality` for the corpus
        named in the request.
        '''
        corpus = corpus_name_from_request(request)
        report = report_cardinality(corpus)
        return Response(report)
3 changes: 2 additions & 1 deletion docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,13 @@ services:
- cluster.name=ianalizer-es-data-cluster
- bootstrap.memory_lock=true
- xpack.security.enabled=false
- xpack.security.http.ssl.enabled=false
- logger.org.elasticsearch.discovery=ERROR
- logger.org.elasticsearch.transport=ERROR
- logger.org.elasticsearch.http=ERROR
- logger.org.elasticsearch.cluster=ERROR
- "ES_JAVA_OPTS=-Xms2g -Xmx2g"
- ELASTIC_PASSWORD=$ELASTIC_ROOT_PASSWORD

ulimits:
memlock:
soft: -1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@
id="fields" title="Fields">
<div class="block" *ngFor="let field of corpus.fields">
<ia-field-info [field]="field"
[coverage]="fieldCoverage ? fieldCoverage[field.name] : undefined">
[coverage]="fieldCoverage ? fieldCoverage[field.name] : undefined"
[cardinality]="fieldCardinality ? fieldCardinality[field.name] : undefined">
</ia-field-info>
</div>
</ng-template>
Expand Down
6 changes: 5 additions & 1 deletion frontend/src/app/corpus/corpus-info/corpus-info.component.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { Component, OnInit } from '@angular/core';
import { ApiService, CorpusService } from '@services';
import { Corpus, CorpusDocumentationPage, FieldCoverage } from '@models';
import { Corpus, CorpusDocumentationPage, FieldCardinality, FieldCoverage } from '@models';
import { marked } from 'marked';
import { Observable } from 'rxjs';
import { Title } from '@angular/platform-browser';
Expand All @@ -18,6 +18,7 @@ export class CorpusInfoComponent implements OnInit {
corpus: Corpus;

fieldCoverage: FieldCoverage;
fieldCardinality: FieldCardinality;

documentation$: Observable<CorpusDocumentationPage[]>;

Expand All @@ -40,6 +41,9 @@ export class CorpusInfoComponent implements OnInit {
this.apiService.fieldCoverage(corpus.name).then(
result => this.fieldCoverage = result
);
this.apiService.fieldCardinality(corpus.name).then(
result => this.fieldCardinality = result
);
this.title.setTitle(pageTitle(`About ${corpus.title}`));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,21 @@ <h3 class="title is-5">{{field.displayName}}</h3>
<p *ngIf="coverage !== undefined">
<b>{{coveragePercentage}}%</b> of the documents in this corpus have a value for this field
</p>

<p *ngIf="coverage === undefined">
Loading coverage data...
</p>

<!--
  Cardinality states:
  - undefined: the parent has not received the cardinality report yet
  - 0: the value reported for fields that are not keyword fields
    (NOTE(review): a keyword field without any values would also land here - confirm this is acceptable)
  - anything else: the number of unique values for this field
-->
@switch(cardinality) {
@case(undefined) {
<p>Loading cardinality data...</p>
}
@case(0) {
<p>Unique values are only calculated for keyword fields, not free text fields or dates.</p>
}
@default {
<p>There are <b>{{cardinality}}</b> unique values for this field.</p>
}
}

</div>
</details>
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import * as _ from 'lodash';
export class FieldInfoComponent implements OnInit {
@Input() field: CorpusField;
@Input() coverage: number;
@Input() cardinality: number;

mappingNames = {
text: 'text',
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
<div class="mc">
<p-multiSelect [id]="filter.corpusField.name" [disabled]="!options.length" [filter]="options.length>=6" [options]="options" [maxSelectedLabels]=1
placeholder="Choose" [ngModel]="data" (onChange)="update($event.value)"
ariaLabelledBy="legend-filter-{{filter.displayName | slugify}}"
fluid>
<p-multiSelect

[id]="filter.corpusField.name"
[disabled]="!options.length"
[filter]="options.length>=6"
[options]="options"
[virtualScroll]="true"
[virtualScrollItemSize]="60"
[maxSelectedLabels]=1

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why set this?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[virtualScroll]: necessary to enable virtual scrolling
[virtualScrollItemSize]: required input; its value is overridden in CSS (should we make that clear somewhere?)
[lazy]: can be and will be removed, good catch
[maxSelectedLabels]: this ensures that if more than 1 label (filter) is selected, it shows '2 items selected' instead of the label names.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know what maxSelectedLabels means, but why did you change it here?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did not actually change it, I just put it on a newline :)

placeholder="Choose"
[ngModel]="data"
(onChange)="update($event.value)" ariaLabelledBy="legend-filter-{{filter.displayName | slugify}}" fluid
(onPanelShow)="getAllOptionsFromES($event)">

<ng-template let-item pTemplate="item">
<div class="select-label">{{item.label}}</div>
<div class="select-count">{{item.doc_count}}</div>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@
.select-label {
margin-top: 2px;
white-space: pre-line;
max-width: 180px;
vertical-align: top;
flex-grow: 2;
}

/*
 * Let every option row size itself to its content instead of a fixed height.
 * NOTE(review): per the PR discussion, this is what overrides the row height
 * implied by [virtualScrollItemSize] on the p-multiSelect; ::ng-deep and
 * !important are presumably needed to reach into and win over the library's
 * own styles - confirm against the PrimeNG version in use.
 */
::ng-deep .p-multiselect-option {
height: auto !important;
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import { TermsAggregator, TermsResult } from '@models/aggregation';
import { SearchService } from '@services';
import { MultipleChoiceFilter, MultipleChoiceFilterOptions } from '@models';
import { BaseFilterComponent } from '../base-filter.component';
import { MultiSelectLazyLoadEvent } from 'primeng/multiselect';

@Component({
selector: 'ia-multiple-choice-filter',
Expand All @@ -15,6 +16,7 @@ import { BaseFilterComponent } from '../base-filter.component';
})
export class MultipleChoiceFilterComponent extends BaseFilterComponent<MultipleChoiceFilter> {
options: { label: string; value: string; doc_count: number }[] = [];
allOptionsCalled: boolean = false;

constructor(private searchService: SearchService) {
super();
Expand All @@ -25,12 +27,26 @@ export class MultipleChoiceFilterComponent extends BaseFilterComponent<MultipleC
}

/**
 * Refresh the option list whenever the query model changes.
 *
 * Exactly one request is made per update. Once the full option list has been
 * requested (`allOptionsCalled`), it is requested in full again; before that,
 * only the default number of options is fetched. Firing both requests would
 * let the limited result overwrite the full list if it resolves last.
 */
onQueryModelUpdate(): void {
    this.getOptions(this.allOptionsCalled);
}

/**
 * Handler for the moment the multiselect panel is shown.
 *
 * Requests the complete list of options from ES, but only for filters whose
 * configured `option_count` exceeds 10, and records that this has happened
 * in `allOptionsCalled`.
 *
 * @param event event emitted by the multiselect; not used here
 */
getAllOptionsFromES(event:MultiSelectLazyLoadEvent) {
    const filterOptions = this.filter.corpusField.filterOptions as MultipleChoiceFilterOptions;
    // written as a negated `>` so a missing option_count is skipped as well
    if (!(filterOptions.option_count > 10)) {
        return;
    }
    this.getOptions(true);
    this.allOptionsCalled = true;
}
Comment on lines +37 to 43

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right now, every filter will make a new query request when the panel is opened, but this results in unnecessary extra requests for fields with only a few values - it's just fetching the data twice. Then for filters like speaker, it's the initial request that isn't necessary, since the list is not shown before this point.

My suggestion would be to use the option_count for this; if the option count is modest, you can eagerly fetch the options, the way we do currently. If it's very high, you delay the request until the user has at least opened the panel once. After that, you can refetch as normal.

(In the future, we might transition the option count to a boolean.)


private async getOptions(): Promise<void> {
private async getOptions(all: boolean = false): Promise<void> {
console.log('fire');
if (this.filter && this.queryModel) {
const optionCount = (this.filter.corpusField.filterOptions as MultipleChoiceFilterOptions).option_count;
// optionCount is set to the maximum when the filter panel is shown, but not when other filters change
const optionCount = all ? 10000 : (this.filter.corpusField.filterOptions as MultipleChoiceFilterOptions).option_count;
const aggregator = new TermsAggregator(this.filter.corpusField, optionCount);
const queryModel = this.queryModel.clone();
queryModel.filterForField(this.filter.corpusField).deactivate();
Expand Down
5 changes: 5 additions & 0 deletions frontend/src/app/models/visualization.ts
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,8 @@ export interface ChartParameters {
export interface FieldCoverage {
[field: string]: number;
};

/**
 * Number of unique values per field, keyed by field name.
 *
 * This is the shape of the response of the `cardinality/<corpus>` endpoint.
 */
export interface FieldCardinality {
[field: string]: number;
}
8 changes: 7 additions & 1 deletion frontend/src/app/services/api.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import { Injectable } from '@angular/core';

import { HttpClient, HttpParams } from '@angular/common/http';
import { interval, Observable } from 'rxjs';
import { firstValueFrom, interval, Observable } from 'rxjs';
import { filter, switchMap, take, takeUntil } from 'rxjs/operators';
import {
AggregateTermFrequencyParameters,
Expand All @@ -13,6 +13,7 @@ import {
Download,
DownloadOptions,
FieldCoverage,
FieldCardinality,
FoundDocument,
GeoDocument,
GeoLocation,
Expand Down Expand Up @@ -185,6 +186,11 @@ export class ApiService {
return this.http.get<FieldCoverage>(url).toPromise();
}

/**
 * Fetch the number of unique values for each field of a corpus.
 *
 * @param corpusName name of the corpus, used as the last segment of the URL
 * @returns promise resolving to the first value emitted by the GET request
 */
fieldCardinality(corpusName: string): Promise<FieldCardinality> {
    const request$ = this.http.get<FieldCardinality>(
        this.apiRoute(this.visApiURL, `cardinality/${corpusName}`)
    );
    return firstValueFrom(request$);
}

// Download
public requestFullData(
data:
Expand Down
Loading