Read sentences from a text file in Python
Your regex works on the text above if you do this:
The only problem is, the regex splits on the dot in "Mr." from your text above, so you need to fix/change that. One solution to this, though not perfect, is that you could take out all occurrences of a dot after Mr:
This matches an 'M' followed by minimum 1, maximum 3 alphanumeric chars (\w{1,3}), followed by a dot. The parenthesised part of the pattern is grouped and captured, and it's referenced in the replacement as '\1' (or group 1, as you could have more parenthesised groups). So essentially, the Mr. or Mrs. is matched, but only the Mr or Mrs part is captured, and the Mr. or Mrs. is then replaced by the captured part, which excludes the dot. And then:
will work the way you want. 18 Python code examples are found related to " read sentences". You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. Example 1 def read_sentences_from_file(path_to_file, one_sentence_per_line=True): lines = [] with io.open(path_to_file, mode="r", encoding="utf-8") as file: for line in file: line = line.strip() if line != "": lines.append(line.strip()) if one_sentence_per_line: sentences = lines else: text = " ".join(lines) sentences = list(split_multi(text)) sentences = [sentence for sentence in sentences if sentence != ""] return sentences ##### Printing / writing helpers ##### Example 2 def read_sentences(path, vocab, is_train, repr="word", ngram_size=3, test_vocab=None): questions = [] max_len = 0 with codecs.open(path, "r", "UTF-8") as f: for i, line in enumerate(f): q_tokens = split_sent(normalize_unicode(line.strip()), repr, ngram_size) token_ids = [] if len(q_tokens) > max_len: max_len = len(q_tokens) for token in q_tokens: if token not in vocab[repr]: if is_train: vocab[repr][token] = len(vocab[repr]) elif repr == "word" and token not in test_vocab[repr]: test_vocab[repr][token] = len(vocab[repr]) + len(test_vocab[repr]) if token in vocab[repr]: token_ids.append(vocab[repr][token]) elif repr == "word": token_ids.append(test_vocab[repr][token]) else: token_ids.append(OOV_WORD_INDEX) questions.append(token_ids) return questions, max_len Example 3 def read_sentences(self, tagged_sentences): # Preprocessing: Separate sentences, and output different arrays for words and tags. num_sentences = len(tagged_sentences) all_words = [] all_pos_tags = [] maxsentlen = 0 for tagged_sentence in tagged_sentences: words = [] pos_tags = [] # Expects each token to be a "_" separated combination of word and POS tag. 
for word_tag in tagged_sentence.split(" "): word, tag = word_tag.split("_") word = word.lower() words.append(word) pos_tags.append(tag) sentlen = len(words) if sentlen > maxsentlen: maxsentlen = sentlen all_words.append(words) all_pos_tags.append(pos_tags) return maxsentlen, all_words, all_pos_tags Example 4 def read_nbest_sentences(nbest_path): f_nbest = smart_open(nbest_path, 'r') index = 0 nbest_sentences = [] nbest_per_sentence = [] for line in f_nbest: line = line.strip() pieces = line.split(' ||| ') if int(pieces[0]) == index: nbest_per_sentence.append(pieces[1]) else: nbest_sentences.append(nbest_per_sentence) nbest_per_sentence = [] nbest_per_sentence.append(pieces[1]) index = index+1 nbest_sentences.append(nbest_per_sentence) f_nbest.close() return nbest_sentences Example 5 def read_sentences(stanford_file_name, file_id): stanford_file = codecs.open(stanford_file_name, 'r', 'utf-8') sentences = [] raw_sentences = [] tokens = [] text_line = '' state_line = '' sent_offset = 0 state = False state1 = False for line in stanford_file: if line.startswith('Sentence #'): if state: sentences.append(asent.Sentence(tokens)) sentences[-1].offset = sent_offset sentences[-1].raw_txt = text_line sentences[-1].file_id = file_id text_line = '' state_line = '' tokens = [] state = False state1 = False elif len(line) > 1 and line[-2]==']' and (state or line.startswith('[Text=')): if state_line: token = asent.Token.parse_stanford_line(state_line + ' ' + line[:-2], {}) else: token = asent.Token.parse_stanford_line(line[1:-2], {}) if not state1: sent_offset = token.char_start ind_start = token.char_start - sent_offset ind_end = token.char_end - sent_offset token.reset_char_spans(ind_start, ind_end) word = token.original_word word = word.replace(u"\u00A0", "_") if '_' in word: split_word = word.split('_') split_inds = filter(lambda x: word[x] == '_', range(len(word))) first_word = word[:split_inds[0]] token.original_word = first_word token.word = first_word if normalize_ne: 
token.pred_lexeme = first_word.lower() else: token.pred_lexeme = first_word.lower() + u'/' + token.pos.lower() token.const_lexeme = first_word token.char_end = token.char_start + split_inds[0] tokens.append(token) for j, w in enumerate(split_word[1:]): char_start = token.char_start + split_inds[j] + 1 if j + 1 < len(split_inds): char_end = token.char_start + split_inds[j+1] else: char_end = token.char_start + len(word) new_token = asent.Token(w, w, token.pos, token.constant_label, token.is_ne, token.is_timex, token.ne_tag, token.normalized_ne_tag, char_start=char_start, char_end=char_end) tokens.append(new_token) else: tokens.append(token) state = True state1 = True elif line.startswith('[Text='): state_line = line[1:].strip() state = True else: #if line.strip(): if state: state_line += ' ' + line.strip() else: text_line += line.replace('\n', ' ') if state: sentences.append(asent.Sentence(tokens)) sentences[-1].offset = sent_offset sentences[-1].raw_txt = text_line sentences[-1].file_id = file_id return sentences Example 6 def read_all_sentences(inpath): all_sentences = [] in_file = file(inpath, 'rt') for line in in_file: if line.startswith('<'): continue line = line.strip().lower() sentences = re.split('\t', line) for sentence in sentences: sentence = sentence.strip() all_sentences.append(sentence) in_file.close() return all_sentences Example 7 def read_conll_sentences(sent_file_name): sent_file = open(sent_file_name, 'r') sent_conll_in = [[line.split('\t') for line in sent.split('\n')] for sent in sent_file.read().split('\n\n')[:-1]] sentences = [] for sent_conll in sent_conll_in: sentences.append(Sentence.parse_conll(sent_conll)) return sentences Example 8 def read_conll_dep_sentences(sent_file_name): sent_file = open(sent_file_name, 'r') sent_conll_in = [[line.split('\t') for line in sent.split('\n')] for sent in sent_file.read().split('\n\n')[:-1]] sentences = [] for sent_conll in sent_conll_in: sentences.append(Sentence.parse_conll_dep(sent_conll)) return 
sentences Example 9 def read_all_sentences_labels(filenames, unit='viseme'): r""" Multi-file version of `read_sentence_labels` which prevents reading the ground truth file multiple times Parameters ---------- filenames unit Returns ------- """ if unit == 'viseme': transcript = viseme_file elif unit == 'phoneme': transcript = phoneme_file elif unit == 'character': transcript = character_file else: raise Exception('only `viseme`, `phoneme` and `character` unit transcriptions are supported') with open(transcript, 'r') as f: contents = f.read() labels = {} for filename in filenames: file = path.splitext(path.split(filename)[1])[0] labels[filename] = _get_transcript_from_buffer(contents, file) return labels Example 10 def read_source_sentences(inference_input_file): """Load inference data.""" with codecs.getreader("utf-8")( tf.io.gfile.GFile(inference_input_file, mode="rb")) as f: inference_data = f.read().splitlines() return inference_data Example 11 def read_sentences(self): lines = self.text.split('\n') raw = [sentence for inner_list in lines for sentence in sent_tokenize(inner_list)] return [[w.lower() for w in word_tokenize(s) if w not in string.punctuation] for s in raw] Example 12 def read_sentences(): id_to_sents = collections.defaultdict(list) with open(OPTS.batch_file) as f: reader = csv.DictReader(f) for row in reader: input_qids = row['Input.qids'].split('\t') input_sents = row['Input.sents'].split('\t') ans_is_good = row['Answer.is-good'].split('\t') ans_responses = row['Answer.responses'].split('\t') for qid, s, is_good, response in zip(input_qids, input_sents, ans_is_good, ans_responses): if is_good == 'yes': response = s if response not in id_to_sents[qid]: id_to_sents[qid].append(response) return id_to_sents Example 13 def read_all_sentences_labels(): with open(path.join(_current_path, 'configs', 'labels'), 'r') as f: contents = f.read().splitlines() labels_dict = dict([(line.split(' ', 1)) for line in contents]) parsed_dict = {} for (k,v) in 
labels_dict.items(): parsed_sentence = v.replace(' ', '_') parsed_sentence = parsed_sentence.lower() parsed_dict[k] = list(parsed_sentence) return parsed_dict Example 14 def read_sentences(self): self.sentences=[] self.list_nmea.DeleteAllItems() data=self.conf.get('NMEA0183', 'sentences') try: temp_list=eval(data) except:temp_list=[] for ii in temp_list: self.sentences.append(ii) fields=',' for i in ii[1]: if type(i) is str: fields+=i+',' elif type(i) is list: fields+=i[1]+',' self.list_nmea.Append([ii[0],ii[2],fields]) Example 15 def read_conll_sentences_md(f, delim=None, comment_pattern="#"): """Read sentences from a conll file. :param f: `str` The file to read from. :param delim: `str` The token between columns in the file. Note: If there are document annotations in the conll file then they will show up in the meta data for what would be the first sentence of that doc If you have a sentence where the first token is `#` it will show up in the metadata. If this happens you'll need to update you comments to use a different comment pattern, something like `# comment:` I recommend having a space in you patten so it can't show up as a conll token :returns: `Generator[Tuple[List[List[str]], List[List[str]]]` The first element is the list or rows, the second is a list of comment lines that preceded that sentence in the file. """ sentence, meta = [], [] for line in f: line = line.rstrip() # Comments are not allowed in the middle of a sentence so if we find a line that # starts with # but we are in a sentence it must be a # as a token so we should # not skip it. 
If this is a comment we track it in our meta data list if not sentence and line.startswith(comment_pattern): meta.append(line) continue if len(line) == 0: if sentence: yield sentence, meta sentence, meta = [], [] continue sentence.append(line.split(delim)) if sentence: yield sentence, meta Example 16 def read_conll_sentences(f, delim=None, allow_comments=True, comment_pattern="#"): """Read sentences from a conll file. :param f: `str` The file to read from. :param delim: `str` The token between columns in the file. Note: If you have a sentence where the first token is `#` it will get eaten by the metadata. If this happens you need to set `allow_comments=True` and not have comments in the file. If you have comments in the file and set this then they will show up in the sentences :returns: `Generator[List[List[str]]]` A list of rows representing a sentence. """ sentence = [] for line in f: line = line.rstrip() # Comments are not allowed in the middle of a sentence so if we find a line that # starts with # but we are in a sentence it must be a # as a token so we should # not skip it if allow_comments and not sentence and line.startswith(comment_pattern): continue # Blank lines signal the end of a sentence if len(line) == 0: # If we built a sentence yield it, this check allows multiple blank lines in a row if sentence: yield sentence # Reset the sentence sentence = [] continue # This is a normal row, we split and take the tokens. 
sentence.append(line.split(delim)) # If we have a sentence then the file didn't end with a new line, we yield the sentence # so we don't lose it if sentence: yield sentence Example 17 def read_sentences(self, filename): sentences = [] extra = dict() example_ids = [] with open(filename) as f: for line in tqdm(f, desc='read'): for s in self.read_line(line): if self.filter_length > 0 and len(s) > self.filter_length: continue if self.include_id: example_id = s[0] s = s[1:] else: example_id = len(sentences) if self.lowercase: s = [w.lower() for w in s] example_ids.append(example_id) sentences.append(s) extra['example_ids'] = example_ids return { "sentences": sentences, "extra": extra } Example 18 def read_sentences(fobj): sentence = [] for line in fobj: if line == "\n": yield sentence sentence = [] continue try: ( pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func, ) = line.split() except ValueError: # Word may be unicode whitespace. ( pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func, ) = re.split(' *\t*', line.strip()) word = escape_special_chars(word) lemma = escape_special_chars(lemma) morph = morph.replace('|',',') if proj_head == '_': proj_head = head proj_func = func sentence.append( Word( int(pos), word, lemma, tag2, morph, int(head), func, int(proj_head), proj_func)) How do you read a text file in a sentence in a sentence in Python?One way to ensure that your file is closed is to use the with keyword. The readline() method is going to read one line from the file and return that. The readlines() method will read and return a list of all of the lines in the file. An alternative to these different read methods would be to use a for loop .
How do you read a line from a text file in Python? Steps for reading a text file in Python: First, open a text file for reading by using the open() function. Second, read text from the text file using the file read(), readline(), or readlines() method of the file object. Third, close the file using the file close() method. How do you read a sentence in Python? Approach: Open a file in read mode which contains a string. Use a for loop to read each line from the text file. Again use a for loop to read each word from the line split by ' '. Display each word from each line in the text file. How do you extract data from a text file in Python? How to extract specific portions of a text file using Python: Make sure you're using Python 3. Reading data from a text file. Using "with open". Reading text files line-by-line. Storing text data in a variable. Searching text for a substring. Incorporating regular expressions. Putting it all together.