Python source code examples: elasticsearch.helpers()
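The examples below come from open-source projects and exercise the most common helpers: scan() for scrolled searches, streaming_bulk()/parallel_bulk()/bulk() for bulk indexing, and expand_action() for building action pairs. As orientation only, here is a minimal sketch of the two core patterns; it is not taken from any example below, and the cluster address and the index name "my-index" are assumptions:

# Minimal sketch of the two patterns most examples below rely on.
# Assumes a cluster on localhost; "my-index" is a hypothetical index name.
from elasticsearch import Elasticsearch
import elasticsearch.helpers

es = Elasticsearch(["http://localhost:9200"])

# scan(): iterate over every hit of a query, with scrolling handled for you
for hit in elasticsearch.helpers.scan(es, index="my-index",
                                      query={"query": {"match_all": {}}}):
    print(hit["_id"])

# streaming_bulk(): index an iterable of actions, yielding (ok, item) per document
docs = ({"_index": "my-index", "value": i} for i in range(10))
for ok, item in elasticsearch.helpers.streaming_bulk(es, docs):
    if not ok:
        print("failed:", item)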

Example 1
def sentences(self, exclude_ids=None, query=None, return_estnltk_object=True, **kwargs):
    if query is None:
        query = {}

    if return_estnltk_object:
        if query.get('fields') is None:
            query['fields'] = ['estnltk_text_object']
        elif 'estnltk_text_object' not in query['fields']:
            raise AssertionError('Query contained the "fields" parameter without the "estnltk_text_object" field. '
                                 'Consider setting the "return_estnltk_object" parameter to False to disable response handling')

    if exclude_ids is None:
        for document in elasticsearch.helpers.scan(self.client, query=query, doc_type='sentence', **kwargs):
            if return_estnltk_object:
                yield Text(json.loads(document['fields']['estnltk_text_object'][0]))
            else:
                # scan() already yields parsed hit dictionaries, so no json.loads is needed
                yield document
    else:
        raise NotImplementedError('ID exclusion is not implemented')
Example 2
def next_bulk_action_batch(self, document_iterator):
    """
    Read a batch of documents from the iterator and convert them into bulk index actions.

    Elasticsearch expects each document to be transmitted on two lines: the first details
    the action to take, and the second contains the actual document.

    See the `Cheaper in Bulk <https://www.elastic.co/guide/en/elasticsearch/guide/1.x/bulk.html>`_ guide.

    Arguments:
        document_iterator (iterator of dicts): raw documents to convert into bulk actions

    Returns: A list of dicts that can be transmitted to elasticsearch using the "bulk" request.
    """
    bulk_action_batch = []
    for raw_data in islice(document_iterator, self.batch_size):
        action, data = elasticsearch.helpers.expand_action(raw_data)
        bulk_action_batch.append(action)
        if data is not None:
            bulk_action_batch.append(data)
    return bulk_action_batch
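For reference, expand_action() is the helper that splits one raw dict into the action line and source line that the bulk endpoint expects; delete actions carry no source line, which is why the loop above checks data is not None. A small illustration with made-up values:

import elasticsearch.helpers

raw = {"_op_type": "index", "_index": "test", "_id": 1, "field": "value"}
action, data = elasticsearch.helpers.expand_action(raw)
# action == {"index": {"_index": "test", "_id": 1}}
# data   == {"field": "value"}
# for a {"_op_type": "delete", ...} action, data would be None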
Example 3
def store_in_elasticsearch(so_it, dry_run, es, index, workers_write, queue_write):
    # write into elasticsearch
    chunk_size = 1000  # TODO make configurable
    actions = elasticsearch_actions(so_it, dry_run, index)
    failcount = 0

    if not dry_run:
        if workers_write > 0:
            results = elasticsearch.helpers.parallel_bulk(es, actions,
                    thread_count=workers_write,
                    queue_size=queue_write,
                    chunk_size=chunk_size)
        else:
            results = elasticsearch.helpers.streaming_bulk(es, actions,
                    chunk_size=chunk_size)
        for success, details in results:
            if not success:
                failcount += 1

        if failcount:
            raise RuntimeError("%s relations failed to index" % failcount)
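Note that parallel_bulk() and streaming_bulk() both return lazy generators, so the for success, details loop above is what actually drives the indexing. When per-document results are not needed, the plain bulk() helper is a simpler alternative; a sketch reusing the es, actions, and chunk_size names from the example above:

# bulk() consumes the action iterable itself and returns an aggregate result.
# With raise_on_error=False it returns (success_count, error_list) instead of
# raising BulkIndexError on the first chunk that contains failures.
success_count, errors = elasticsearch.helpers.bulk(
        es, actions, chunk_size=chunk_size, raise_on_error=False)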
Example 4
def store_in_elasticsearch(results, es, dry_run, workers_write, queue_write, index):
    chunk_size = 1000  # TODO make configurable
    actions = elasticsearch_actions(results, dry_run, index)
    failcount = 0

    if not dry_run:
        if workers_write > 0:
            results = elasticsearch.helpers.parallel_bulk(es, actions,
                    thread_count=workers_write,
                    queue_size=queue_write,
                    chunk_size=chunk_size)
        else:
            results = elasticsearch.helpers.streaming_bulk(es, actions,
                    chunk_size=chunk_size)
        for success, details in results:
            if not success:
                failcount += 1

        if failcount:
            raise RuntimeError("%s relations failed to index" % failcount)
Example 5
def get_target_labels(ids, es, index):
    res = elasticsearch.helpers.scan(client=es,
            query={
                "query": {
                    "ids": {
                        "values": ids,
                    }
                },
                '_source': 'approved_symbol',
                'size': 1,
            },
            index=index
        )

    return dict((hit['_id'], hit['_source']['approved_symbol']) for hit in res)
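A hedged usage sketch of the function above; the Ensembl IDs, the index name, and the returned symbols are made up for illustration:

labels = get_target_labels(["ENSG00000157764", "ENSG00000141510"], es, "gene-index")
# e.g. {"ENSG00000157764": "BRAF", "ENSG00000141510": "TP53"}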
Example 6
def get_nulldata_transactions(self, index):
    # This is a mess. Apologies if you're looking at this

    query = {"_source": ["hash",
                         "height",
                         "txid",
                         "vin.txid",
                         "vout.scriptPubKey.asm",
                         "vout.scriptPubKey.type",
                         "vout.n"
                         ],
             "query": {
                 "bool": {
                     "must": [
                         {"term": {"vout.scriptPubKey.type": "nulldata"}}
                     ]
                 }
             }
             }

    return elasticsearch.helpers.scan(self.es, index=index, query=query, scroll='5m')
Example 7
def get_opreturn_data(self, bottom=None, top=None):

    query = {"_source": ["tx",
                         "height",
                         "n",
                         "txid",
                         "vin.txid",
                         ],
             "query": {
                 "match_all": {}
             }
             }

    if bottom is not None and top is not None:
        query['query'] = {"range": {"height": {"gte": bottom, "lte": top}}}

    return elasticsearch.helpers.scan(self.es, index="btc-opreturn", query=query, size=100, scroll='1m')
Example 8
def get_elasticsearch_helper(self):
    """
    Get helpers module for Elasticsearch. Used to bulk index documents.

    :returns: package ``elasticsearch.helpers``
    """
    return helpers 
Example 9
def bulk_index(self, index, doc_type, items):
    # TODO #653: Remove version-specific support for metrics stores before 7.0.0.
    import elasticsearch.helpers
    if self._cluster_version[0] > 6:
        self.guarded(elasticsearch.helpers.bulk, self._client, items, index=index, chunk_size=5000)
    else:
        self.guarded(elasticsearch.helpers.bulk, self._client, items, index=index, doc_type=doc_type, chunk_size=5000)
Example 10
def main():

    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    else:
        print("Usage: %s <nvd-xml-file>" % (sys.argv[0]))
        sys.exit(1)

    # First let's see if the index exists
    if not es.indices.exists('cve-index'):
        # We have to create it and add a mapping
        with open('cve-index-json-mapping.json') as fh:
            mapping = json.load(fh)
        es.indices.create('cve-index', body=mapping)

    with open(input_file) as fh:
        json_data = json.load(fh)

    the_cves = CVE()
    for i in json_data['CVE_Items']:
        # ['CVE_Items'][0]['cve']['CVE_data_meta']['ID']
        the_cves.add(i)
        #es.update(id=cve_id, index="cve-index", body={'doc' : cve, 'doc_as_upsert': True})

    for ok, item in elasticsearch.helpers.streaming_bulk(es, the_cves, max_retries=2):
        if not ok:
            print("ERROR:")
            print(item)
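streaming_bulk() accepts any iterable of actions, so the CVE container above only needs to be iterable. A hypothetical minimal shape for it, inferred from the example (only the "cve-index" name and the CVE_data_meta ID path come from the example; everything else is an assumption):

class CVE:
    """Hypothetical container matching the usage above: collects raw NVD
    entries and yields one index action per CVE when iterated."""
    def __init__(self):
        self.items = []

    def add(self, raw):
        self.items.append(raw)

    def __iter__(self):
        for raw in self.items:
            yield {"_index": "cve-index",
                   "_id": raw["cve"]["CVE_data_meta"]["ID"],
                   "_source": raw}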
Example 11
def validation_on_start(eco_scores_uri, schema_uri, excluded_biotypes,
        datasources_to_datatypes, es_hosts, es_index_gene, es_index_eco, es_index_efo,
        cache_target, cache_target_u2e, cache_target_contains,
        cache_eco, cache_efo, cache_efo_contains):
    logger = logging.getLogger(__name__)

    validator = opentargets_validator.helpers.generate_validator_from_schema(schema_uri)

    lookup_data = LookUpDataRetriever(new_es_client(es_hosts),
        gene_index=es_index_gene,
        gene_cache_size=cache_target,
        gene_cache_u2e_size=cache_target_u2e,
        gene_cache_contains_size=cache_target_contains,
        eco_index=es_index_eco,
        eco_cache_size=cache_efo_contains,
        efo_index=es_index_efo,
        efo_cache_size=cache_efo,
        efo_cache_contains_size=cache_efo_contains
        ).lookup

    evidence_manager = EvidenceManager(lookup_data, eco_scores_uri,
        excluded_biotypes, datasources_to_datatypes)

    return logger, validator, lookup_data, datasources_to_datatypes, evidence_manager
Example 12
def store(self, es, dry_run, data):
    self.logger.info("Starting drug storage")
    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        actions = elasticsearch_actions(list(data.items()), self.es_index)
        failcount = 0
        if not dry_run:
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                        chunk_size=chunk_size)
            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s relations failed to index" % failcount)

    self.logger.debug("Completed storage")
Example 13
def _store_eco(self, dry_run):

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):

        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        actions = elasticsearch_actions(list(self.ecos.items()), self.es_index)
        failcount = 0

        if not dry_run:
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                        chunk_size=chunk_size)
            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s relations failed to index" % failcount)
Example 14
def store_data(self, dry_run):
    self.logger.info('store_data called')

    self.logger.debug('calling to create new expression index')

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):

        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        actions = elasticsearch_actions(self.hpa_merged_table, dry_run, self.es_index)
        failcount = 0

        if not dry_run:
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                        chunk_size=chunk_size)
            for success, details in results:
                if not success:
                    failcount += 1

            # a second, unreachable failcount check after this block was removed:
            # any failure already raises here
            if failcount:
                raise RuntimeError("%s relations failed to index" % failcount)

    self.logger.info('missing tissues %s', str(_missing_tissues))
Example 15
def get_disease_labels(ids, es, index):
    res = elasticsearch.helpers.scan(client=es,
            query={
                "query": {
                    "ids": {
                        "values": ids,
                    }
                },
                '_source': 'label',
                'size': 1,
            },
            index=index
        )

    return dict((hit['_id'], hit['_source']['label']) for hit in res)
Example 16
def get_nonstandard_transactions(self):
    query = {"_source": ["hash", "vout.scriptPubKey.hex", "vout.scriptPubKey.type"],
             "query": {"match": {"vout.scriptPubKey.type": "nonstandard"}}}

    return elasticsearch.helpers.scan(self.es, index="btc-transactions-*", query=query, scroll='1m')
Example 17
def update_opreturns(self, the_iter):

    errors = []

    for ok, item in elasticsearch.helpers.streaming_bulk(self.es, the_iter, max_retries=2):
        if not ok:
            errors.append(item)

    return errors
Example 18
def add_opreturn_files(self, data):
    errors = []

    for ok, item in elasticsearch.helpers.streaming_bulk(self.es, data, max_retries=2):
        if not ok:
            errors.append(item)

    return errors
Example 19
def process_all(self, dry_run):

    self.g.add_node('root', name="", species="")

    for row in self.downloader.get_pathway_data():
        self.g.add_node(row['id'], name=row['name'], species=row['species'])
    children = set()
    for row in self.downloader.get_pathway_relations():
        self.g.add_edge(row['id'], row['child'])
        children.add(row['child'])

    nodes_without_parent = set(self.g.nodes()) - children
    for node in nodes_without_parent:
        if node != 'root':
            self.g.add_edge('root', node)

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        docs = generate_documents(self.g)
        actions = elasticsearch_actions(docs, self.es_index)
        failcount = 0

        if not dry_run:
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                        chunk_size=chunk_size)
            for success, _ in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s failed to index" % failcount)
Example 20
def get_disease_to_targets_vectors(threshold, evidence_count, es, index):
    '''
    Get all the association objects that are:
    - direct -> to avoid ontology inflation
    - above the minimum evidence count -> remove noise
    - overall score > threshold -> remove very low quality noise
    :param threshold: minimum overall score threshold to consider for fetching association data
    :param evidence_count: minimum number of evidence to consider for fetching association data
    :return: two dictionaries mapping target to disease and the reverse
    '''
    res = elasticsearch.helpers.scan(client=es,
            query={
                "query": {
                    "term": {
                        "is_direct": True,
                    }
                },
                '_source': {
                    'includes':
                        ["target.id", 'disease.id', 'harmonic-sum', 'evidence_count']},
                'size': 1000,
            },
            index=index
        )

    target_results = dict()
    disease_results = dict()

    c = 0
    for hit in res:
        c += 1
        pair_id = str(hit['_id'])
        hit = hit['_source']
        if hit['evidence_count']['total'] >= evidence_count and \
                hit['harmonic-sum']['overall'] >= threshold:
            # store target associations
            if hit['target']['id'] not in target_results:
                target_results[hit['target']['id']] = SparseFloatDict()
            # TODO: return all counts and scores up to datasource level
            target_results[hit['target']['id']][hit['disease']['id']] = hit['harmonic-sum']['overall']

            # store disease associations
            if hit['disease']['id'] not in disease_results:
                disease_results[hit['disease']['id']] = SparseFloatDict()
            # TODO: return all counts and scores up to datasource level
            disease_results[hit['disease']['id']][hit['target']['id']] = hit['harmonic-sum']['overall']

    return target_results, disease_results
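A hedged usage sketch of this function; the threshold, evidence count, and index name are illustrative, and new_es_client is the connection helper seen in the other examples:

es = new_es_client(["http://localhost:9200"])
target_to_disease, disease_to_target = get_disease_to_targets_vectors(
        threshold=0.1, evidence_count=3, es=es, index="associations")
# target_to_disease maps each target id to {disease id: overall harmonic-sum score};
# disease_to_target is the reverse mapping.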